Commit
ad: build full pipeline
thatgeeman committed May 19, 2024
1 parent a828296 commit 75310e1
Showing 9 changed files with 710 additions and 657 deletions.
124 changes: 79 additions & 45 deletions nbs/01_vae.ipynb

Large diffs are not rendered by default.

250 changes: 139 additions & 111 deletions nbs/02_lstm.ipynb

Large diffs are not rendered by default.

777 changes: 315 additions & 462 deletions nbs/03_ad_complete.ipynb

Large diffs are not rendered by default.

46 changes: 35 additions & 11 deletions nbs/index.ipynb

Large diffs are not rendered by default.

Binary file added sample_data/result.png
7 changes: 6 additions & 1 deletion ts_vae_lstm/_modidx.py
@@ -5,7 +5,11 @@
'doc_host': 'https://thatgeeman.github.io',
'git_url': 'https://github.com/thatgeeman/ts_vae_lstm',
'lib_path': 'ts_vae_lstm'},
'syms': { 'ts_vae_lstm.ad_complete': {},
'syms': { 'ts_vae_lstm.ad_complete': { 'ts_vae_lstm.ad_complete.AD': ('ad_complete.html#ad', 'ts_vae_lstm/ad_complete.py'),
'ts_vae_lstm.ad_complete.predict_next_embeddings': ( 'ad_complete.html#predict_next_embeddings',
'ts_vae_lstm/ad_complete.py'),
'ts_vae_lstm.ad_complete.reconstruct_ts': ( 'ad_complete.html#reconstruct_ts',
'ts_vae_lstm/ad_complete.py')},
'ts_vae_lstm.concepts': {'ts_vae_lstm.concepts.get_window': ('concepts.html#get_window', 'ts_vae_lstm/concepts.py')},
'ts_vae_lstm.lstm': { 'ts_vae_lstm.lstm.LSTMModel': ('lstm.html#lstmmodel', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.LSTMModel.__init__': ('lstm.html#lstmmodel.__init__', 'ts_vae_lstm/lstm.py'),
@@ -15,6 +19,7 @@
'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.TSLSTMDataset.__init__': ('lstm.html#tslstmdataset.__init__', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.TSLSTMDataset.__len__': ('lstm.html#tslstmdataset.__len__', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.concat_first_emb': ('lstm.html#concat_first_emb', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.get_embeddings': ('lstm.html#get_embeddings', 'ts_vae_lstm/lstm.py')},
'ts_vae_lstm.vae': { 'ts_vae_lstm.vae.Decoder': ('vae.html#decoder', 'ts_vae_lstm/vae.py'),
'ts_vae_lstm.vae.Decoder.__init__': ('vae.html#decoder.__init__', 'ts_vae_lstm/vae.py'),
109 changes: 106 additions & 3 deletions ts_vae_lstm/ad_complete.py
@@ -1,15 +1,15 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/03_ad_complete.ipynb.

# %% auto 0
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'LSTM_MODEL']
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'LSTM_MODEL', 'df', 'predict_next_embeddings', 'reconstruct_ts', 'AD']

# %% ../nbs/03_ad_complete.ipynb 3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from .concepts import get_window

from scipy import signal
import os
import math
@@ -24,7 +24,9 @@

# %% ../nbs/03_ad_complete.ipynb 5
from .vae import VAE, Encoder, Decoder, StochasticSampler
from fastcore.xtras import noop
from .lstm import get_embeddings, LSTMModel, concat_first_emb
from .concepts import get_window
from fastcore.xtras import noop, flatten
from dotenv import load_dotenv

# %% ../nbs/03_ad_complete.ipynb 6
@@ -36,3 +38,104 @@
LSTM_MODEL = os.getenv("LSTM_MODEL")

print(f"LSTM model: {LSTM_MODEL}\nVAE model: {VAE_MODEL}")

# %% ../nbs/03_ad_complete.ipynb 15
df = pd.DataFrame(data_test_norm, index=data["t_test"], columns=["value"])
df.head(2)

# %% ../nbs/03_ad_complete.ipynb 24
@torch.no_grad()
def predict_next_embeddings(emb, lstm_model):
lstm_model.eval()
next_emb = lstm_model(emb)
return next_emb


@torch.no_grad()
def reconstruct_ts(emb, vae_model):
vae_model.eval()
ts = vae_model.decoder(emb)
return ts

# %% ../nbs/03_ad_complete.ipynb 25
@torch.no_grad()
def AD(
past_window,
vae_model,
lstm_model,
window_size=48,
threshold=0.8,
latent_dim=24,
reconstruct_with_true=False,
stats={"lstm": (0, 1), "vae": (0, 1)},
normalize=True,
steps=None,
# device="cpu",
):
steps = np.asanyarray(steps)
if normalize:
past_window = (past_window - stats["vae"][0]) / stats["vae"][1]
remainder = len(past_window) % window_size
if remainder > 0:
        print(
            f"past window length is not a multiple of window_size! Dropping the first {remainder} points."
        )
past_window = past_window[remainder:]
steps = steps[remainder:]
# start
n_windows = int(len(past_window) / window_size)
print(f"Number of windows from time series: {n_windows}")
ad_score = np.zeros(n_windows)
ad_status = np.zeros((n_windows, window_size))
ad_window_loc = np.zeros((n_windows, window_size))
# window = torch.as_tensor(window).to(device=device).unsqueeze(0)
# print(window.shape)
emb = get_embeddings(
past_window, n_windows=n_windows, vae_model=vae_model, latent_dim=latent_dim
) # encode 48 steps to 24 embeddings
n_features = emb.size(-1)
emb = emb.reshape(1, n_windows, latent_dim, n_features)
# standardize with training stats
# print(emb.shape)
ts = np.zeros((n_windows, window_size))
past_window = past_window.reshape(ts.shape)
steps = steps.reshape(ts.shape)
for widx in range(n_windows):
emb_idx = emb[:, widx, :, :]
if normalize:
emb_idx = (emb_idx - stats["lstm"][0]) / stats["lstm"][1]
# pass embeddings from 1 to 23, predict 2 to 24
next_emb = predict_next_embeddings(emb_idx[:, :-1, :], lstm_model=lstm_model)
# print(next_emb.shape)
        # either keep the true embeddings for steps 1-23 and append the last
        # predicted embedding, or prepend the true first embedding to the
        # predicted embeddings for steps 2-24
if reconstruct_with_true:
next_emb = concat_first_emb(
next_emb[:, -1, :].unsqueeze(0), first_emb=emb_idx[:, :-1, :], dim=1
)
else:
next_emb = concat_first_emb(
next_emb, first_emb=emb_idx[:, 0, :].unsqueeze(0), dim=1
)
        next_emb = next_emb[:, :, 0]  # last index not needed by the decoder
# VAE decoder
ts_widx = reconstruct_ts(next_emb, vae_model=vae_model)
ts_widx = ts_widx.squeeze().detach().numpy()
ts[widx, :] = ts_widx
# calculate for this window the anomaly score
ad_score[widx] = np.linalg.norm(past_window[widx] - ts_widx, 2)
ad_status[widx, :] = ad_score[widx] > threshold # global label for all items
ad_window_loc[widx, :] = steps[widx, :]

if normalize:
ts = ts * stats["vae"][1] + stats["vae"][0]
past_window = past_window * stats["vae"][1] + stats["vae"][0]

return {
"reconstructed": ts,
"actual": past_window,
"score": ad_score,
"status": ad_status,
"steps": steps,
"threshold": threshold,
}
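
For context, a minimal usage sketch of the new AD entry point follows. Everything outside AD itself is an assumption for illustration: the synthetic series, the step indices, the identity stats, and the variable names; trained vae_model and lstm_model instances (from the 01_vae and 02_lstm notebooks) are presumed to be in scope. Note that the per-window score is the L2 norm between the input window and its reconstruction, so "status" flags every window whose norm exceeds threshold.

import numpy as np

# Hypothetical input: 10 windows of 48 points each (values are synthetic).
series = np.sin(np.linspace(0, 20, 480)) + 0.05 * np.random.randn(480)
steps = np.arange(len(series))

result = AD(
    past_window=series,
    vae_model=vae_model,    # trained VAE, assumed loaded from VAE_MODEL
    lstm_model=lstm_model,  # trained LSTM, assumed loaded from LSTM_MODEL
    window_size=48,
    latent_dim=24,
    threshold=0.8,
    stats={"lstm": (0, 1), "vae": (0, 1)},  # training (mean, std) pairs would go here
    steps=steps,
)
flagged = result["score"] > result["threshold"]  # boolean mask over the 10 windows
print(result["score"].shape, flagged.sum())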
12 changes: 10 additions & 2 deletions ts_vae_lstm/lstm.py
@@ -1,7 +1,7 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_lstm.ipynb.

# %% auto 0
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'get_embeddings', 'TSLSTMDataset', 'LSTMModel']
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'get_embeddings', 'TSLSTMDataset', 'LSTMModel', 'concat_first_emb']

# %% ../nbs/02_lstm.ipynb 3
from dotenv import load_dotenv
@@ -35,7 +35,9 @@

# %% ../nbs/02_lstm.ipynb 16
@torch.no_grad()
def get_embeddings(x, n_windows=1, latent_dim=32, seq_len=1, sampler_repeat=200):
def get_embeddings(
x, vae_model=None, n_windows=1, latent_dim=32, seq_len=1, sampler_repeat=200
):
"""
_summary_
@@ -58,6 +60,7 @@ def get_embeddings(x, n_windows=1, latent_dim=32, seq_len=1, sampler_repeat=200)
_description_
"""
# actual_shape = x.shape[0]
assert vae_model, "VAE model missing"
vae_model.eval()
x = torch.from_numpy(x.astype(np.float32)).view(
n_windows, -1, seq_len
@@ -184,3 +187,8 @@ def forward(self, x):

# %% ../nbs/02_lstm.ipynb 62
from fastcore.xtras import partial

# %% ../nbs/02_lstm.ipynb 72
def concat_first_emb(y_emb, first_emb, dim=0):
"""Concat first_emb to y_emb"""
return torch.concat([first_emb, y_emb], dim=dim)
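
A quick shape check of the new helper, with illustrative sizes (the latent width of 8 is arbitrary, not from this commit): concat_first_emb simply prepends first_emb to y_emb along dim, which is how AD rebuilds a full embedding sequence from one true embedding plus the LSTM's predictions.

import torch

first = torch.zeros(1, 1, 8)  # true first embedding
preds = torch.ones(1, 23, 8)  # predicted embeddings for steps 2..24
full = concat_first_emb(preds, first_emb=first, dim=1)
print(full.shape)  # torch.Size([1, 24, 8])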
42 changes: 20 additions & 22 deletions ts_vae_lstm/vae.py
@@ -33,17 +33,17 @@
for k in data.keys():
print(k)

# %% ../nbs/01_vae.ipynb 15
# %% ../nbs/01_vae.ipynb 17
df = pd.DataFrame(data["training"], index=data["t_train"], columns=["value"])
df.head(2)

# %% ../nbs/01_vae.ipynb 20
# %% ../nbs/01_vae.ipynb 22
p = 48 # past sequences at time t. 48 steps = a day
end_steps = [es for es in range(p, len(df), 1)]
# step is one since we want overlapping windows for VAE training
len(end_steps), end_steps[:3], end_steps[-3:]

# %% ../nbs/01_vae.ipynb 21
# %% ../nbs/01_vae.ipynb 23
data_windowed = [
{
"subset": get_window(
@@ -59,46 +59,44 @@
for t in end_steps
]

# %% ../nbs/01_vae.ipynb 24
# %% ../nbs/01_vae.ipynb 26
split_ratio = 0.2
val_data_idxs = np.random.choice(
range(len(data_windowed)), size=int(split_ratio * len(data_windowed)), replace=False
)
trn_data_idxs = [idx for idx in range(len(data_windowed)) if idx not in val_data_idxs]
len(val_data_idxs), len(trn_data_idxs)

# %% ../nbs/01_vae.ipynb 25
# %% ../nbs/01_vae.ipynb 27
from torch import nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score

# %% ../nbs/01_vae.ipynb 26
# %% ../nbs/01_vae.ipynb 28
val_data = [data_windowed[idx] for idx in val_data_idxs]
trn_data = [data_windowed[idx] for idx in trn_data_idxs]

# %% ../nbs/01_vae.ipynb 29
# %% ../nbs/01_vae.ipynb 31
n_features = trn_data[0]["subset"].shape[1] # - 1
n_features

# %% ../nbs/01_vae.ipynb 31
# %% ../nbs/01_vae.ipynb 33
means = np.zeros((len(trn_data), n_features)) # ((len(trn_data), 4))
stds = np.zeros((len(trn_data), n_features)) # ((len(trn_data), 4))
slice_from = n_features - 1
"""
for i, _trn_data in enumerate(trn_data):
means[i] = (np.mean(_trn_data["subset"][:, slice_from:], axis=0)).astype(np.float32)
stds[i] = (np.var(_trn_data["subset"][:, slice_from:], axis=0) ** 0.5).astype(
np.float32
)
"""
means = means.mean(0)
stds = stds.mean(0)

means, stds

# %% ../nbs/01_vae.ipynb 34
# %% ../nbs/01_vae.ipynb 36
class TSDataset(Dataset):
def __init__(self, data, mean, std):
self.data = data
@@ -115,15 +113,15 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.data)

# %% ../nbs/01_vae.ipynb 35
# %% ../nbs/01_vae.ipynb 37
dset_trn = TSDataset(trn_data, mean=means, std=stds)
dset_val = TSDataset(val_data, mean=means, std=stds)
# use same stats from training data

# %% ../nbs/01_vae.ipynb 38
# %% ../nbs/01_vae.ipynb 40
batch_size = 8

# %% ../nbs/01_vae.ipynb 39
# %% ../nbs/01_vae.ipynb 41
dl_trn = DataLoader(
dataset=dset_trn,
batch_size=batch_size,
@@ -139,7 +137,7 @@ def __len__(self):
num_workers=num_workers,
)

# %% ../nbs/01_vae.ipynb 45
# %% ../nbs/01_vae.ipynb 47
# encoder
# with l_win set to 24, the model considers each 24-hour period as one sequence.
# pad: if your array is [1, 2, 3] and you symmetrically pad it with 1 unit, the result would be [2, 1, 2, 3, 2].
@@ -208,7 +206,7 @@ def init_weights(self):
torch.nn.init.xavier_normal_(m.weight)
m.bias.data.fill_(0)

# %% ../nbs/01_vae.ipynb 49
# %% ../nbs/01_vae.ipynb 51
class StochasticSampler(nn.Module):
"""We basically want to parametrize the sampling from the latent space"""

@@ -231,7 +229,7 @@ def forward(self, z_mean, z_log_var):
else (z_mean + torch.exp(0.5 * z_log_var) * eps)
)

# %% ../nbs/01_vae.ipynb 55
# %% ../nbs/01_vae.ipynb 57
# with l_win set to 24, the model considers each 24-hour period as one sequence.
# pad: if your array is [1, 2, 3] and you symmetrically pad it with 1 unit, the result would be [2, 1, 2, 3, 2].
# xavier_initializer()
@@ -318,7 +316,7 @@ def init_weights(self):
torch.nn.init.xavier_normal_(m.weight)
m.bias.data.fill_(0)

# %% ../nbs/01_vae.ipynb 59
# %% ../nbs/01_vae.ipynb 61
class VAE(nn.Module):
def __init__(
self, input_shape, latent_dim=20, act=F.leaky_relu, deterministic=False
@@ -346,7 +344,7 @@ def forward(self, x):
vae_loss = reconstruction_loss + loss_kl
return reconstructed_x, vae_loss

# %% ../nbs/01_vae.ipynb 64
# %% ../nbs/01_vae.ipynb 66
@torch.no_grad()
def evaluate_reconstruction(original_signal, reconstructed_signal):
"""
@@ -382,11 +380,11 @@ def evaluate_reconstruction(original_signal, reconstructed_signal):
"mae": mae,
}

# %% ../nbs/01_vae.ipynb 66
# %% ../nbs/01_vae.ipynb 68
device = "cuda" if torch.cuda.is_available() else "cpu"
device

# %% ../nbs/01_vae.ipynb 67
# %% ../nbs/01_vae.ipynb 69
def validate_epoch(dls, scorer):
"""For the full dataloader, calculate the running loss and score"""
model.eval()
@@ -406,5 +404,5 @@ def validate_epoch(dls, scorer):
running_score += score
return running_loss / len(dls), running_score / len(dls)

# %% ../nbs/01_vae.ipynb 68
# %% ../nbs/01_vae.ipynb 70
from fastcore.xtras import partial
