# MTSAD baseline model intro
###### tags: `Notes`
##### Author: Jay Su
## Model detail
* Spatial --> PCA + DBSCAN (flags timestamps that fall outside the dense cluster in PCA space; see the sketch below)
* Temporal --> Forecaster (ARIMA-based, flags points with unusually large forecast residuals)
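The spatial detector treats each timestamp as one point whose coordinates are the values of all series at that time, projects these points to 2-D with PCA, and marks DBSCAN noise points (label `-1`) as anomalies. Below is a minimal sketch on synthetic data, not the real dataset; the injected outlier and the printed index are illustrative assumptions, and the `eps` value mirrors `DB_EPSILON` in the script that follows.

``` python=
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))   # 200 timestamps x 5 series (synthetic)
X[50] += 10                     # inject one spatial anomaly at timestamp 50

# Standardize, project to 2-D, and let DBSCAN mark sparse points as noise (-1)
projected = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))
labels = DBSCAN(eps=3.3, min_samples=2).fit(projected).labels_
print(np.where(labels == -1)[0])   # expected to contain index 50
```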
## Python code
``` python=
import os
import time
import warnings
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from statsmodels.tsa.arima.model import ARIMA
# Change these to your own data path and file name
DATA_PATH = '/Users/jaysu/Desktop/workspace/Caloudi/MTSAD/data_loader/outputdata/5/'
FILE_NAME = 'chinamotor-1.csv'
DB_EPSILON = 3.3  # spatial threshold: DBSCAN neighborhood radius (eps)
FR_THRESHOLD = 99.5  # temporal threshold: residual percentile
# Spatial anomaly detector based on PCA and DBSCAN
def PCA_DB(data):
    # Standardize each series, then project the timestamps to two principal components
    X_scaled = StandardScaler().fit_transform(data)
    projected = PCA(n_components=2).fit_transform(X_scaled)
    # DBSCAN marks sparse points as noise (-1); those timestamps are the spatial anomalies
    clustering = DBSCAN(eps=DB_EPSILON, min_samples=2).fit(projected)
    return clustering.labels_
# Temporal anomaly detector based on an ARIMA forecaster
def calculate_residual(y_true, y_pred):
    # Element-wise forecast error
    err = []
    for i in range(len(y_pred)):
        err.append(y_true[i] - y_pred[i])
    return err
def forecast_residual(sequence):
    # Simple moving-average forecaster kept as an alternative to ARIMA (FR below uses ARIMA)
    train = np.array(sequence)
    k = 7  # window size
    pred = [0]
    for i in range(1, len(train)):
        # Mean of the preceding k values (or of all preceding values while i <= k)
        window = train[max(i - k, 0):i]
        pred.append(window.mean())
    return calculate_residual(train, np.array(pred))
def FR(sequence):
    train = np.asarray(sequence)
    # Fit ARIMA(0, 0, 1) and score each point by its absolute in-sample residual
    model = ARIMA(train, order=(0, 0, 1)).fit()
    pred = model.predict(start=0, end=len(train) - 1)
    score = np.abs(pred - train)
    # Flag the points whose score exceeds the FR_THRESHOLD percentile
    anomaly_indexes = np.where(score > np.percentile(score, FR_THRESHOLD))[0]
    return anomaly_indexes
# Plot the results figure
def result_visualize(Ss, St, data):
    # The most frequent DBSCAN label is taken as the "normal" cluster
    unique_labels, counts = np.unique(Ss, return_counts=True)
    label_S = unique_labels[np.argmax(counts)]
    fig, axs = plt.subplots(data.shape[1], 1)
    for i in range(data.shape[1]):
        axs[i].plot(data[:, i], label='input MT data')
        axs[i].vlines(np.where(Ss != label_S)[0], ymin=np.min(data[:, i]),
                      ymax=np.max(data[:, i]), color='r', label='spatial anomaly')
        axs[i].vlines(St, ymin=np.min(data[:, i]),
                      ymax=np.max(data[:, i]), color='y', label='temporal anomaly')
    handles, labels = axs[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper right")
    plt.show()
# Main function
def main():
    start = time.time()
    dataset = pd.read_csv(os.path.join(DATA_PATH, FILE_NAME), index_col=0).values
    score_S = PCA_DB(dataset)          # spatial labels, one per timestamp
    score_T = FR(dataset[:, -2])       # temporal anomaly indexes of the second-to-last series
    end = time.time()
    result_visualize(score_S, score_T, dataset)
    print("time spent = " + str((end - start) * 1000) + " milliseconds")


if __name__ == '__main__':
    # Execute the model
    main()
```
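For reference, the same percentile rule used in `FR` can be tried on a synthetic series. This is a minimal sketch under illustrative assumptions (the sine-plus-noise series and the injected spike are made up); the ARIMA order and the 99.5 threshold mirror the script above.

``` python=
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

rng = np.random.default_rng(0)
series = np.sin(np.linspace(0, 20, 400)) + rng.normal(0, 0.1, size=400)
series[300] += 3.0   # inject one temporal anomaly

# Score by absolute in-sample residual, then threshold at the 99.5th percentile
fitted = ARIMA(series, order=(0, 0, 1)).fit()
score = np.abs(fitted.predict(start=0, end=len(series) - 1) - series)
print(np.where(score > np.percentile(score, 99.5))[0])   # expected to include index 300
```

Because the threshold is a percentile, it controls the expected fraction of flagged points (here roughly the top 0.5%) rather than an absolute residual size.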
## Future work
* Better ways to combine the spatial and temporal detectors into a single hybrid model
* Rule-based post-processor