Commit
ad: build full pipeline
thatgeeman committed May 19, 2024
1 parent a828296 commit 75310e1
Showing 9 changed files with 710 additions and 657 deletions.
124 changes: 79 additions & 45 deletions nbs/01_vae.ipynb

Large diffs are not rendered by default.

250 changes: 139 additions & 111 deletions nbs/02_lstm.ipynb

Large diffs are not rendered by default.

777 changes: 315 additions & 462 deletions nbs/03_ad_complete.ipynb

Large diffs are not rendered by default.

46 changes: 35 additions & 11 deletions nbs/index.ipynb

Large diffs are not rendered by default.

Binary file added sample_data/result.png
7 changes: 6 additions & 1 deletion ts_vae_lstm/_modidx.py
@@ -5,7 +5,11 @@
'doc_host': 'https://thatgeeman.github.io',
'git_url': 'https://github.com/thatgeeman/ts_vae_lstm',
'lib_path': 'ts_vae_lstm'},
'syms': { 'ts_vae_lstm.ad_complete': {},
'syms': { 'ts_vae_lstm.ad_complete': { 'ts_vae_lstm.ad_complete.AD': ('ad_complete.html#ad', 'ts_vae_lstm/ad_complete.py'),
'ts_vae_lstm.ad_complete.predict_next_embeddings': ( 'ad_complete.html#predict_next_embeddings',
'ts_vae_lstm/ad_complete.py'),
'ts_vae_lstm.ad_complete.reconstruct_ts': ( 'ad_complete.html#reconstruct_ts',
'ts_vae_lstm/ad_complete.py')},
'ts_vae_lstm.concepts': {'ts_vae_lstm.concepts.get_window': ('concepts.html#get_window', 'ts_vae_lstm/concepts.py')},
'ts_vae_lstm.lstm': { 'ts_vae_lstm.lstm.LSTMModel': ('lstm.html#lstmmodel', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.LSTMModel.__init__': ('lstm.html#lstmmodel.__init__', 'ts_vae_lstm/lstm.py'),
@@ -15,6 +19,7 @@
'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.TSLSTMDataset.__init__': ('lstm.html#tslstmdataset.__init__', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.TSLSTMDataset.__len__': ('lstm.html#tslstmdataset.__len__', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.concat_first_emb': ('lstm.html#concat_first_emb', 'ts_vae_lstm/lstm.py'),
'ts_vae_lstm.lstm.get_embeddings': ('lstm.html#get_embeddings', 'ts_vae_lstm/lstm.py')},
'ts_vae_lstm.vae': { 'ts_vae_lstm.vae.Decoder': ('vae.html#decoder', 'ts_vae_lstm/vae.py'),
'ts_vae_lstm.vae.Decoder.__init__': ('vae.html#decoder.__init__', 'ts_vae_lstm/vae.py'),
109 changes: 106 additions & 3 deletions ts_vae_lstm/ad_complete.py
@@ -1,15 +1,15 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/03_ad_complete.ipynb.

# %% auto 0
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'LSTM_MODEL']
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'LSTM_MODEL', 'df', 'predict_next_embeddings', 'reconstruct_ts', 'AD']

# %% ../nbs/03_ad_complete.ipynb 3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from .concepts import get_window

from scipy import signal
import os
import math
@@ -24,7 +24,9 @@

# %% ../nbs/03_ad_complete.ipynb 5
from .vae import VAE, Encoder, Decoder, StochasticSampler
from fastcore.xtras import noop
from .lstm import get_embeddings, LSTMModel, concat_first_emb
from .concepts import get_window
from fastcore.xtras import noop, flatten
from dotenv import load_dotenv

# %% ../nbs/03_ad_complete.ipynb 6
@@ -36,3 +38,104 @@
LSTM_MODEL = os.getenv("LSTM_MODEL")

print(f"LSTM model: {LSTM_MODEL}\nVAE model: {VAE_MODEL}")

# %% ../nbs/03_ad_complete.ipynb 15
df = pd.DataFrame(data_test_norm, index=data["t_test"], columns=["value"])
df.head(2)

# %% ../nbs/03_ad_complete.ipynb 24
@torch.no_grad()
def predict_next_embeddings(emb, lstm_model):
lstm_model.eval()
next_emb = lstm_model(emb)
return next_emb


@torch.no_grad()
def reconstruct_ts(emb, vae_model):
vae_model.eval()
ts = vae_model.decoder(emb)
return ts

# %% ../nbs/03_ad_complete.ipynb 25
@torch.no_grad()
def AD(
past_window,
vae_model,
lstm_model,
window_size=48,
threshold=0.8,
latent_dim=24,
reconstruct_with_true=False,
stats={"lstm": (0, 1), "vae": (0, 1)},
normalize=True,
steps=None,
# device="cpu",
):
steps = np.asanyarray(steps)
if normalize:
past_window = (past_window - stats["vae"][0]) / stats["vae"][1]
remainder = len(past_window) % window_size
if remainder > 0:
        print(
            f"past window length is not a multiple of window_size! Dropping the first {remainder} points."
        )
past_window = past_window[remainder:]
steps = steps[remainder:]
# start
n_windows = int(len(past_window) / window_size)
print(f"Number of windows from time series: {n_windows}")
ad_score = np.zeros(n_windows)
ad_status = np.zeros((n_windows, window_size))
ad_window_loc = np.zeros((n_windows, window_size))
# window = torch.as_tensor(window).to(device=device).unsqueeze(0)
# print(window.shape)
emb = get_embeddings(
past_window, n_windows=n_windows, vae_model=vae_model, latent_dim=latent_dim
) # encode 48 steps to 24 embeddings
n_features = emb.size(-1)
emb = emb.reshape(1, n_windows, latent_dim, n_features)
# standardize with training stats
# print(emb.shape)
ts = np.zeros((n_windows, window_size))
past_window = past_window.reshape(ts.shape)
steps = steps.reshape(ts.shape)
for widx in range(n_windows):
emb_idx = emb[:, widx, :, :]
if normalize:
emb_idx = (emb_idx - stats["lstm"][0]) / stats["lstm"][1]
# pass embeddings from 1 to 23, predict 2 to 24
next_emb = predict_next_embeddings(emb_idx[:, :-1, :], lstm_model=lstm_model)
# print(next_emb.shape)
        # either keep the true embeddings for steps 1-23 and append the last
        # predicted embedding, or prepend the true first embedding to the
        # predicted embeddings for steps 2-24
if reconstruct_with_true:
next_emb = concat_first_emb(
next_emb[:, -1, :].unsqueeze(0), first_emb=emb_idx[:, :-1, :], dim=1
)
else:
next_emb = concat_first_emb(
next_emb, first_emb=emb_idx[:, 0, :].unsqueeze(0), dim=1
)
        next_emb = next_emb[:, :, 0]  # last index not needed by the decoder
# VAE decoder
ts_widx = reconstruct_ts(next_emb, vae_model=vae_model)
ts_widx = ts_widx.squeeze().detach().numpy()
ts[widx, :] = ts_widx
# calculate for this window the anomaly score
ad_score[widx] = np.linalg.norm(past_window[widx] - ts_widx, 2)
ad_status[widx, :] = ad_score[widx] > threshold # global label for all items
ad_window_loc[widx, :] = steps[widx, :]

if normalize:
ts = ts * stats["vae"][1] + stats["vae"][0]
past_window = past_window * stats["vae"][1] + stats["vae"][0]

return {
"reconstructed": ts,
"actual": past_window,
"score": ad_score,
"status": ad_status,
"steps": steps,
"threshold": threshold,
}
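
For context, a minimal usage sketch of the new AD entry point follows. Everything outside AD itself is an assumption for illustration: the synthetic series, the step indices, the identity stats, and the variable names; trained vae_model and lstm_model instances (from the 01_vae and 02_lstm notebooks) are presumed to be in scope. Note that the per-window score is the L2 norm between the input window and its reconstruction, so "status" flags every window whose norm exceeds threshold.

import numpy as np

# Hypothetical input: 10 windows of 48 points each (values are synthetic).
series = np.sin(np.linspace(0, 20, 480)) + 0.05 * np.random.randn(480)
steps = np.arange(len(series))

result = AD(
    past_window=series,
    vae_model=vae_model,    # trained VAE, assumed loaded from VAE_MODEL
    lstm_model=lstm_model,  # trained LSTM, assumed loaded from LSTM_MODEL
    window_size=48,
    latent_dim=24,
    threshold=0.8,
    stats={"lstm": (0, 1), "vae": (0, 1)},  # training (mean, std) pairs would go here
    steps=steps,
)
flagged = result["score"] > result["threshold"]  # boolean mask over the 10 windows
print(result["score"].shape, flagged.sum())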
12 changes: 10 additions & 2 deletions ts_vae_lstm/lstm.py
@@ -1,7 +1,7 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_lstm.ipynb.

# %% auto 0
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'get_embeddings', 'TSLSTMDataset', 'LSTMModel']
__all__ = ['BASEDIR', 'MODELDIR', 'VAE_MODEL', 'get_embeddings', 'TSLSTMDataset', 'LSTMModel', 'concat_first_emb']

# %% ../nbs/02_lstm.ipynb 3
from dotenv import load_dotenv
@@ -35,7 +35,9 @@

# %% ../nbs/02_lstm.ipynb 16
@torch.no_grad()
def get_embeddings(x, n_windows=1, latent_dim=32, seq_len=1, sampler_repeat=200):
def get_embeddings(
x, vae_model=None, n_windows=1, latent_dim=32, seq_len=1, sampler_repeat=200
):
"""
_summary_
@@ -58,6 +60,7 @@ def get_embeddings(x, n_windows=1, latent_dim=32, seq_len=1, sampler_repeat=200)
_description_
"""
# actual_shape = x.shape[0]
assert vae_model, "VAE model missing"
vae_model.eval()
x = torch.from_numpy(x.astype(np.float32)).view(
n_windows, -1, seq_len
@@ -184,3 +187,8 @@ def forward(self, x):

# %% ../nbs/02_lstm.ipynb 62
from fastcore.xtras import partial

# %% ../nbs/02_lstm.ipynb 72
def concat_first_emb(y_emb, first_emb, dim=0):
"""Concat first_emb to y_emb"""
return torch.concat([first_emb, y_emb], dim=dim)
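
A quick shape check of the new helper, with illustrative sizes (the latent width of 8 is arbitrary, not from this commit): concat_first_emb simply prepends first_emb to y_emb along dim, which is how AD rebuilds a full embedding sequence from one true embedding plus the LSTM's predictions.

import torch

first = torch.zeros(1, 1, 8)  # true first embedding
preds = torch.ones(1, 23, 8)  # predicted embeddings for steps 2..24
full = concat_first_emb(preds, first_emb=first, dim=1)
print(full.shape)  # torch.Size([1, 24, 8])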
42 changes: 20 additions & 22 deletions ts_vae_lstm/vae.py
@@ -33,17 +33,17 @@
for k in data.keys():
print(k)

# %% ../nbs/01_vae.ipynb 15
# %% ../nbs/01_vae.ipynb 17
df = pd.DataFrame(data["training"], index=data["t_train"], columns=["value"])
df.head(2)

# %% ../nbs/01_vae.ipynb 20
# %% ../nbs/01_vae.ipynb 22
p = 48 # past sequences at time t. 48 steps = a day
end_steps = [es for es in range(p, len(df), 1)]
# step is one since we want overlapping windows for VAE training
len(end_steps), end_steps[:3], end_steps[-3:]

# %% ../nbs/01_vae.ipynb 21
# %% ../nbs/01_vae.ipynb 23
data_windowed = [
{
"subset": get_window(
@@ -59,46 +59,44 @@
for t in end_steps
]

# %% ../nbs/01_vae.ipynb 24
# %% ../nbs/01_vae.ipynb 26
split_ratio = 0.2
val_data_idxs = np.random.choice(
range(len(data_windowed)), size=int(split_ratio * len(data_windowed)), replace=False
)
trn_data_idxs = [idx for idx in range(len(data_windowed)) if idx not in val_data_idxs]
len(val_data_idxs), len(trn_data_idxs)

# %% ../nbs/01_vae.ipynb 25
# %% ../nbs/01_vae.ipynb 27
from torch import nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score

# %% ../nbs/01_vae.ipynb 26
# %% ../nbs/01_vae.ipynb 28
val_data = [data_windowed[idx] for idx in val_data_idxs]
trn_data = [data_windowed[idx] for idx in trn_data_idxs]

# %% ../nbs/01_vae.ipynb 29
# %% ../nbs/01_vae.ipynb 31
n_features = trn_data[0]["subset"].shape[1] # - 1
n_features

# %% ../nbs/01_vae.ipynb 31
# %% ../nbs/01_vae.ipynb 33
means = np.zeros((len(trn_data), n_features)) # ((len(trn_data), 4))
stds = np.zeros((len(trn_data), n_features)) # ((len(trn_data), 4))
slice_from = n_features - 1
"""
for i, _trn_data in enumerate(trn_data):
means[i] = (np.mean(_trn_data["subset"][:, slice_from:], axis=0)).astype(np.float32)
stds[i] = (np.var(_trn_data["subset"][:, slice_from:], axis=0) ** 0.5).astype(
np.float32
)
"""
means = means.mean(0)
stds = stds.mean(0)

means, stds

# %% ../nbs/01_vae.ipynb 34
# %% ../nbs/01_vae.ipynb 36
class TSDataset(Dataset):
def __init__(self, data, mean, std):
self.data = data
@@ -115,15 +113,15 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.data)

# %% ../nbs/01_vae.ipynb 35
# %% ../nbs/01_vae.ipynb 37
dset_trn = TSDataset(trn_data, mean=means, std=stds)
dset_val = TSDataset(val_data, mean=means, std=stds)
# use same stats from training data

# %% ../nbs/01_vae.ipynb 38
# %% ../nbs/01_vae.ipynb 40
batch_size = 8

# %% ../nbs/01_vae.ipynb 39
# %% ../nbs/01_vae.ipynb 41
dl_trn = DataLoader(
dataset=dset_trn,
batch_size=batch_size,
@@ -139,7 +137,7 @@ def __len__(self):
num_workers=num_workers,
)

# %% ../nbs/01_vae.ipynb 45
# %% ../nbs/01_vae.ipynb 47
# encoder
# with l_win set to 24, the model considers each 24-hour period as one sequence.
# pad: if your array is [1, 2, 3] and you symmetrically pad it with 1 unit, the result would be [2, 1, 2, 3, 2].
@@ -208,7 +206,7 @@ def init_weights(self):
torch.nn.init.xavier_normal_(m.weight)
m.bias.data.fill_(0)

# %% ../nbs/01_vae.ipynb 49
# %% ../nbs/01_vae.ipynb 51
class StochasticSampler(nn.Module):
"""We basically want to parametrize the sampling from the latent space"""

@@ -231,7 +229,7 @@ def forward(self, z_mean, z_log_var):
else (z_mean + torch.exp(0.5 * z_log_var) * eps)
)

# %% ../nbs/01_vae.ipynb 55
# %% ../nbs/01_vae.ipynb 57
# with l_win set to 24, the model considers each 24-hour period as one sequence.
# pad: if your array is [1, 2, 3] and you symmetrically pad it with 1 unit, the result would be [2, 1, 2, 3, 2].
# xavier_initializer()
@@ -318,7 +316,7 @@ def init_weights(self):
torch.nn.init.xavier_normal_(m.weight)
m.bias.data.fill_(0)

# %% ../nbs/01_vae.ipynb 59
# %% ../nbs/01_vae.ipynb 61
class VAE(nn.Module):
def __init__(
self, input_shape, latent_dim=20, act=F.leaky_relu, deterministic=False
@@ -346,7 +344,7 @@ def forward(self, x):
vae_loss = reconstruction_loss + loss_kl
return reconstructed_x, vae_loss

# %% ../nbs/01_vae.ipynb 64
# %% ../nbs/01_vae.ipynb 66
@torch.no_grad()
def evaluate_reconstruction(original_signal, reconstructed_signal):
"""
@@ -382,11 +380,11 @@ def evaluate_reconstruction(original_signal, reconstructed_signal):
"mae": mae,
}

# %% ../nbs/01_vae.ipynb 66
# %% ../nbs/01_vae.ipynb 68
device = "cuda" if torch.cuda.is_available() else "cpu"
device

# %% ../nbs/01_vae.ipynb 67
# %% ../nbs/01_vae.ipynb 69
def validate_epoch(dls, scorer):
"""For the full dataloader, calculate the running loss and score"""
model.eval()
@@ -406,5 +404,5 @@ def validate_epoch(dls, scorer):
running_score += score
return running_loss / len(dls), running_score / len(dls)

# %% ../nbs/01_vae.ipynb 68
# %% ../nbs/01_vae.ipynb 70
from fastcore.xtras import partial
