R語言預測股價之LSTM範例

--- title: 'R語言預測股價之LSTM範例' disqus: hackmd --- R語言預測股價之LSTM範例 === [TOC] ## 流程圖 ```flow st=>start: 股價資料 op=>parallel: 切割訓練集、驗證集、測試集 para=>parallel: parallel tasks op2=>operation: 轉換為LSTM陣列 op3=>operation: 模型訓練 io_traindata=>inputoutput: 訓練、驗證集 io_testdata=>inputoutput: 測試集 io_model=>inputoutput: Keras Model op_predict=>operation: 模型預測 e=>end st->op op(path1)->io_traindata->op2->op3->io_model->op_predict op(path2, right, botto)->io_testdata->op_predict ``` ## 載入所需套件 ``` R= rm(list=ls());gc() library(keras) # 深度學習Keras套件 library(tidyquant) # 抓取股價資料套件 library(dplyr) # 資料篩選套件 library(TTR) # 計算股票技術指標套件 library(caret) # 資料預處理套件 library(parallel) # 平行運算套件 library(abind) # list轉換成data.frame所需套件 library(tidyr) # 轉換長寬資料套件 library(ggplot2) # 資料視覺化套件 ``` ## 抓取股價資料首先抓取Yahoo Finance的股價資料，此處以台積電為例 ``` R= download.stockCode <- c("2330.TW") stockData <- tq_get(x = download.stockCode, get = "stock.prices", from = "2008-01-01") summary(stockData) ``` ## 模型建立 ### 定義訓練集、驗證集、測試集時間範圍預測股票報酬/股價切割時間相當重要，本範例將使用移動窗口(Rolling Window)，於每年中的1月1日與7月1日進行模型訓練。訓練樣本時間長度為5年，驗證樣本時間長度為6個月，測試集樣本時間長度為6個月。 ```mermaid gantt title Rolling Window section 模型1 訓練期(5年) :a1, 2009-07-01, 2014-07-01 驗證期(半年) :a2, 2014-07-01, 2015-01-01 測試期(半年) :a3, 2015-01-01, 2015-06-30 section 模型2 訓練期(5年) :a1, 2010-01-01, 2015-01-01 驗證期(半年) :a2, 2015-01-01, 2015-06-30 測試期(半年) :a3, 2015-07-01, 2015-12-31 ``` ``` R= # 訓練樣本時間長度 train_timeSpan <- lubridate::years(5) # 驗證樣本時間長度 validation_timeSpan <- months(6) # 模型重訓練日期 predictDateList <- lapply(paste0(c(2015:max(year(stockData$date)))) , function(x)paste0(x, c("0101", "0701"))) %>% unlist() %>% ymd() %>% sort() ``` ### 定義單一樣本時間長度由於LSTM之單一樣本維度定義為$dim(timeSteps, featureNums)$，從Yahoo Finance抓取之股價資料為日資料，而LSTM Model單一樣本中列資料與列資料間隔時間只要是固定時間即可，不必一定要間隔一日，也可以是二日、三日。本例將以近5日的調整後盤價作為單一樣本 ``` R= # 樣本總時間長度 lookback <- 5 # 樣本列間時間長度 step <- 1 # 時間步長 timeStepNums <- floor(lookback / step) ``` 建立資料轉換函數與模型建立函數 ```R= # convert data.frame to LSTM array to_lstm_array <- function(start_ind, data.frame, lookback = lookback, timeStepNums = timeStepNums){ # 特徵數量 featureNums <- ncol(data.frame) x <- data.frame[seq(from = (start_ind-lookback), to = start_ind, length.out = timeStepNums),] %>% as.matrix() %>% array(dim=c(1, timeStepNums , featureNums)) return(x) } # 模型建立函數 build_lstm_model <- function(timeStepNums = timeStepNums, featureNums = featureNums){ model <- keras_model_sequential() model %>% layer_lstm(units=10, input_shape=c(timeStepNums, featureNums), return_sequences=T) %>% layer_dropout(0.5) %>% layer_lstm(units=32, input_shape=c(timeStepNums, featureNums), return_sequences=F) %>% layer_dropout(0.5) %>% layer_dense(units=10, activation="relu") %>% layer_dense(units=1, activation="relu") model %>% compile( loss="mse", optimizer="adam", metrics="mse" ) return(model) } ``` ## 資料預處理在預處理上主要分為兩項步驟 1.建立特徵、標籤 2.資料正規化(normalization) ### 1.建立模型特徵、預測目標 ``` R= stockData <- stockData %>% na.omit() %>% arrange(date) %>% select(date, close = adjusted) %>% mutate(Y_Label = lead(close, 1), Y_outDate = lead(date, 1)) %>% na.omit() ``` 為簡化特徵，本範例單純以調整後收盤價作為特徵預測隔日調整後收盤價，並且記錄標籤日期預防使用未來資料。 ### 2.資料正規化(normalization) 本文使用Caret套件中的preProcess函數，針對訓練集進行標準化，並使用訓練集建立之模型預測將驗證集、測試集進行正規化。另外建立逆正規化函數，以便將預測值還原。 ```R= # 逆向標準化函數 unPreProc <- function(preProc, data){ stopifnot(class(preProc) == "preProcess") stopifnot("data.frame" %in% class(data)) for(i in names(preProc$mean)){ tmp <- data[, i] * preProc$std[[i]] + preProc$mean[[i]] data[, i] <- tmp } return(data) } ``` ## 模型訓練每半年重新訓練模型 ``` R= # 依日期進行模型訓練 dailyList <- stockData$date %>% unique() %>% sort() allInfo <- tibble() ind <- 1 for(ind in 1:length(predictDateList)){ cat(paste0("目前預測日期: ", predictDateList[ind], ", 目前進度: ", ind, " / ", length(predictDateList) ) , "\n") # 測試期期間 testData.StartDate <- dailyList[max(which(dailyList <= predictDateList[ind])) - lookback] testData.endDate <- as.Date(ifelse(ind != length(predictDateList), predictDateList[ind+1], max(stockData$date))) # 驗證期期間 validData.endDate <- predictDateList[ind] validData.StartDate <- dailyList[max(which(dailyList <= validData.endDate - validation_timeSpan)) - lookback] # 訓練期期間 trainData.endDate <- predictDateList[ind] - validation_timeSpan trainData.StartDate <- dailyList[max(which(dailyList <= trainData.endDate - train_timeSpan)) - lookback] # 切割資料 testInfo <- stockData %>% filter(date >= testData.StartDate, date < testData.endDate) %>% select(c(date, Y_outDate, Y_Label)) X_test <- stockData %>% filter(date >= testData.StartDate, date < testData.endDate) %>% select(-c(date, Y_outDate, Y_Label)) Y_test <- stockData %>% filter(date >= testData.StartDate, date < testData.endDate) %>% select(Y_Label) X_val <- stockData %>% filter(date >= validData.StartDate, Y_outDate < validData.endDate) %>% select(-c(date, Y_outDate, Y_Label)) Y_val <- stockData %>% filter(date >= validData.StartDate, Y_outDate < validData.endDate) %>% select(Y_Label) X_train <- stockData %>% filter(date >= trainData.StartDate, Y_outDate < trainData.endDate) %>% select(-c(date, Y_outDate, Y_Label)) Y_train <- stockData %>% filter(date >= trainData.StartDate, Y_outDate < trainData.endDate) %>% select(Y_Label) # 將資料特徵進行標準化 # 使用訓練集樣本建立各特徵標準化模型 # center: x - mean(x) # scale: x / std(x) preProcValues <- preProcess(X_train, method = c("center", "scale")) X_train <- predict(preProcValues, X_train) X_val <- predict(preProcValues, X_val) X_test <- predict(preProcValues, X_test) # unPreProc(preProcValues, X_train) preProcY <- preProcess(Y_train, method = c("center", "scale")) Y_train <- predict(preProcY, Y_train) Y_val <- predict(preProcY, Y_val) # 將資料結構轉換為LSTM陣列 # 使用CPU執行緒數目 cut_Cluster<- detectCores()-6 　 cl <- makeCluster(cut_Cluster) # 匯入使用變數給每個執行緒 # clusterExport(cl, c("X_train","Y_train")) # 匯入套件給每個執行緒 clusterEvalQ(cl, c(library(dplyr))) # 執行平行運算 output_train <- parLapply(cl, (lookback+1):nrow(X_train), to_lstm_array, X_train, lookback, timeStepNums) output_val <- parLapply(cl, (lookback+1):nrow(X_val), to_lstm_array, X_val, lookback, timeStepNums) output_test <- parLapply(cl, (lookback+1):nrow(X_test), to_lstm_array, X_test, lookback, timeStepNums) # 關閉平行運算 stopCluster(cl) # 合併樣本集 X_train <- do.call("abind", c(output_train, along=1)) Y_train <- Y_train[(lookback+1):nrow(Y_train),] %>% as.matrix() X_val <- do.call("abind", c(output_val, along=1)) Y_val <- Y_val[(lookback+1):nrow(Y_val),] %>% as.matrix() X_test <- do.call("abind", c(output_test, along=1)) Y_test <- Y_test[(lookback+1):nrow(Y_test),] %>% as.matrix() testInfo <- testInfo[(lookback+1):nrow(testInfo),] # 建立模型 # 樣本維度 cat("樣本集資料:", dim(X_train), "\n") model <- build_lstm_model(timeStepNums = dim(X_train)[2], featureNums = dim(X_train)[3]) # 檢查模型 summary(model) # 開始訓練 batch_size <- 50 epochs <- 50 patience <- 20 trainHistory <- model %>% fit( x=X_train, y=Y_train, validation_data = list(X_val, Y_val), batch_size=batch_size, epochs=epochs, shuffle=T, callbacks = c(callback_early_stopping(monitor = "val_loss", patience = patience, mode = "auto")) ) # final epoch final_epoch <- trainHistory$metrics$loss %>% length() %>% max() final_epoch <- ifelse(final_epoch != epochs, epochs - patience, epochs) # 合併訓練集與驗證集，使用最佳epoch訓練模型 model <- build_lstm_model(timeStepNums = dim(X_train)[2], featureNums = dim(X_train)[3]) # 開始訓練 trainHistory <- model %>% fit( x = abind(X_train, X_val , along = 1), y = abind(Y_train, Y_val , along = 1), validation_data = list(X_val, Y_val), batch_size = batch_size, epochs = final_epoch, shuffle = T ) # 儲存模型 model_fileName <- paste0("stockPrediction_LSTM_", predictDateList[ind], ".h5") model %>% save_model_hdf5(model_fileName) # 預測測試期資料 Y_predict <- model %>% predict(X_test) %>% as.data.frame() %>% rename_at(vars(colnames(.)), ~ colnames(Y_train)) # 還原Y預測值 Y_predict <- unPreProc(preProcY, Y_predict) # 合併測試期資料與預測值 testInfo <- testInfo %>% bind_cols(Y_predict) # 儲存測試期資料 allInfo <- allInfo %>% bind_rows(testInfo) } ``` ## 資料視覺化 ``` R= # 資料視覺化 plotData = allInfo %>% rename(Y_predict = Y_Label1, Y_real = Y_Label) %>% gather(category, adjclose, Y_real, Y_predict) plotData$category = factor(plotData$category , levels = c("Y_real" , "Y_predict")) ggplot(data = plotData , aes(x = Y_outDate, y = round(adjclose, 2) , color= category))+ geom_line(aes(color = category ), size = 1.2)+ labs(title = paste0("調整後收盤價實際與預測走勢圖")) + ylab("調整後收盤價")+ theme_bw() + theme(plot.title = element_text(hjust = 0.5, size = 16), plot.subtitle = element_text(hjust = 0.5, size = 16), text = element_text(family = "BL", size = 14), legend.position="bottom") ``` ###### tags: `R` `LSTM` `Keras` `Stock_Prediction`