# MTSAD baseline model intro
###### tags: `Notes`
##### Author: Jay Su
## Model detail
* Spatial --> PCA + DBSCAN (flags timestamps that fall outside the dense cluster in PCA space; see the sketch below)
* Temporal --> Forecaster (ARIMA-based, flags points with unusually large forecast residuals)
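The spatial detector treats each timestamp as one point whose coordinates are the values of all series at that time, projects these points to 2-D with PCA, and marks DBSCAN noise points (label `-1`) as anomalies. Below is a minimal sketch on synthetic data, not the real dataset; the injected outlier and the printed index are illustrative assumptions, and the `eps` value mirrors `DB_EPSILON` in the script that follows.

``` python=
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))   # 200 timestamps x 5 series (synthetic)
X[50] += 10                     # inject one spatial anomaly at timestamp 50

# Standardize, project to 2-D, and let DBSCAN mark sparse points as noise (-1)
projected = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))
labels = DBSCAN(eps=3.3, min_samples=2).fit(projected).labels_
print(np.where(labels == -1)[0])   # expected to contain index 50
```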
## Python code
``` python=
import os
import time
import warnings
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from statsmodels.tsa.arima.model import ARIMA
# Change these to your own data path and file name
DATA_PATH = '/Users/jaysu/Desktop/workspace/Caloudi/MTSAD/data_loader/outputdata/5/'
FILE_NAME = 'chinamotor-1.csv'
DB_EPSILON = 3.3  # spatial threshold: DBSCAN neighborhood radius (eps)
FR_THRESHOLD = 99.5  # temporal threshold: residual percentile
# Spatial anomaly detector based on PCA and DBSCAN
def PCA_DB(data):
    # Standardize each series, then project the timestamps to two principal components
    X_scaled = StandardScaler().fit_transform(data)
    projected = PCA(n_components=2).fit_transform(X_scaled)
    # DBSCAN marks sparse points as noise (-1); those timestamps are the spatial anomalies
    clustering = DBSCAN(eps=DB_EPSILON, min_samples=2).fit(projected)
    return clustering.labels_
# Temporal anomaly detector based on an ARIMA forecaster
def calculate_residual(y_true, y_pred):
    # Element-wise forecast error
    err = []
    for i in range(len(y_pred)):
        err.append(y_true[i] - y_pred[i])
    return err
def forecast_residual(sequence):
    # Simple moving-average forecaster kept as an alternative to ARIMA (FR below uses ARIMA)
    train = np.array(sequence)
    k = 7  # window size
    pred = [0]
    for i in range(1, len(train)):
        # Mean of the preceding k values (or of all preceding values while i <= k)
        window = train[max(i - k, 0):i]
        pred.append(window.mean())
    return calculate_residual(train, np.array(pred))
def FR(sequence):
    train = np.asarray(sequence)
    # Fit ARIMA(0, 0, 1) and score each point by its absolute in-sample residual
    model = ARIMA(train, order=(0, 0, 1)).fit()
    pred = model.predict(start=0, end=len(train) - 1)
    score = np.abs(pred - train)
    # Flag the points whose score exceeds the FR_THRESHOLD percentile
    anomaly_indexes = np.where(score > np.percentile(score, FR_THRESHOLD))[0]
    return anomaly_indexes
# Plot the results figure
def result_visualize(Ss, St, data):
    # The most frequent DBSCAN label is taken as the "normal" cluster
    unique_labels, counts = np.unique(Ss, return_counts=True)
    label_S = unique_labels[np.argmax(counts)]
    fig, axs = plt.subplots(data.shape[1], 1)
    for i in range(data.shape[1]):
        axs[i].plot(data[:, i], label='input MT data')
        axs[i].vlines(np.where(Ss != label_S)[0], ymin=np.min(data[:, i]),
                      ymax=np.max(data[:, i]), color='r', label='spatial anomaly')
        axs[i].vlines(St, ymin=np.min(data[:, i]),
                      ymax=np.max(data[:, i]), color='y', label='temporal anomaly')
    handles, labels = axs[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper right")
    plt.show()
# Main function
def main():
    start = time.time()
    dataset = pd.read_csv(os.path.join(DATA_PATH, FILE_NAME), index_col=0).values
    score_S = PCA_DB(dataset)          # spatial labels, one per timestamp
    score_T = FR(dataset[:, -2])       # temporal anomaly indexes of the second-to-last series
    end = time.time()
    result_visualize(score_S, score_T, dataset)
    print("time spent = " + str((end - start) * 1000) + " milliseconds")


if __name__ == '__main__':
    # Execute the model
    main()
```
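For reference, the same percentile rule used in `FR` can be tried on a synthetic series. This is a minimal sketch under illustrative assumptions (the sine-plus-noise series and the injected spike are made up); the ARIMA order and the 99.5 threshold mirror the script above.

``` python=
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

rng = np.random.default_rng(0)
series = np.sin(np.linspace(0, 20, 400)) + rng.normal(0, 0.1, size=400)
series[300] += 3.0   # inject one temporal anomaly

# Score by absolute in-sample residual, then threshold at the 99.5th percentile
fitted = ARIMA(series, order=(0, 0, 1)).fit()
score = np.abs(fitted.predict(start=0, end=len(series) - 1) - series)
print(np.where(score > np.percentile(score, 99.5))[0])   # expected to include index 300
```

Because the threshold is a percentile, it controls the expected fraction of flagged points (here roughly the top 0.5%) rather than an absolute residual size.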
## Future work
* Better ways to combine the spatial and temporal detectors into a single hybrid model
* Rule-based post-processor