Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add validation steps to embedders #332

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 55 additions & 13 deletions srai/embedders/gtfs2vec/embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import json
from functools import reduce
from pathlib import Path
from typing import Any, Dict, List, Optional, Type, Union
from typing import Any, Dict, List, Optional, Tuple, Type, Union

import geopandas as gpd
import numpy as np
Expand Down Expand Up @@ -84,6 +84,9 @@ def fit(
regions_gdf: gpd.GeoDataFrame,
features_gdf: gpd.GeoDataFrame,
joint_gdf: gpd.GeoDataFrame,
val_regions_gdf: Optional[gpd.GeoDataFrame] = None,
val_features_gdf: Optional[gpd.GeoDataFrame] = None,
val_joint_gdf: Optional[gpd.GeoDataFrame] = None,
) -> None:
"""
Fit model to a given data.
Expand All @@ -92,23 +95,29 @@ def fit(
regions_gdf (gpd.GeoDataFrame): Region indexes and geometries.
features_gdf (gpd.GeoDataFrame): Feature indexes, geometries and feature values.
joint_gdf (gpd.GeoDataFrame): Joiner result with region-feature multi-index.
val_regions_gdf (Optional[gpd.GeoDataFrame], optional): Validation region indexes.
val_features_gdf (Optional[gpd.GeoDataFrame], optional): Validation feature indexes.
val_joint_gdf (Optional[gpd.GeoDataFrame], optional): Validation joiner result.

Raises:
ValueError: If any of the gdfs index names is None.
ValueError: If joint_gdf.index is not of type pd.MultiIndex or doesn't have 2 levels.
ValueError: If index levels in gdfs don't overlap correctly.
"""
self._validate_indexes(regions_gdf, features_gdf, joint_gdf)
features = self._prepare_features(regions_gdf, features_gdf, joint_gdf)

features, val_features = self._prepare_train_val_features(
regions_gdf, features_gdf, joint_gdf, val_regions_gdf, val_features_gdf, val_joint_gdf
)
if not self._skip_autoencoder:
self._model = self._train_model_unsupervised(features)
self._model = self._train_model_unsupervised(features, val_features)

def fit_transform(
self,
regions_gdf: gpd.GeoDataFrame,
features_gdf: gpd.GeoDataFrame,
joint_gdf: gpd.GeoDataFrame,
val_regions_gdf: Optional[gpd.GeoDataFrame] = None,
val_features_gdf: Optional[gpd.GeoDataFrame] = None,
val_joint_gdf: Optional[gpd.GeoDataFrame] = None,
) -> pd.DataFrame:
"""
Fit model and transform a given data.
Expand All @@ -117,6 +126,9 @@ def fit_transform(
regions_gdf (gpd.GeoDataFrame): Region indexes and geometries.
features_gdf (gpd.GeoDataFrame): Feature indexes, geometries and feature values.
joint_gdf (gpd.GeoDataFrame): Joiner result with region-feature multi-index.
val_regions_gdf (Optional[gpd.GeoDataFrame], optional): Validation region indexes.
val_features_gdf (Optional[gpd.GeoDataFrame], optional): Validation feature indexes.
val_joint_gdf (Optional[gpd.GeoDataFrame], optional): Validation joiner result.

Returns:
pd.DataFrame: Embedding and geometry index for each region in regions_gdf.
Expand All @@ -126,13 +138,13 @@ def fit_transform(
ValueError: If joint_gdf.index is not of type pd.MultiIndex or doesn't have 2 levels.
ValueError: If index levels in gdfs don't overlap correctly.
"""
self._validate_indexes(regions_gdf, features_gdf, joint_gdf)
features = self._prepare_features(regions_gdf, features_gdf, joint_gdf)

features, val_features = self._prepare_train_val_features(
regions_gdf, features_gdf, joint_gdf, val_regions_gdf, val_features_gdf, val_joint_gdf
)
if self._skip_autoencoder:
return features
else:
self._model = self._train_model_unsupervised(features)
self._model = self._train_model_unsupervised(features, val_features)
return self._embed(features)

def _maybe_get_model(self) -> GTFS2VecModel:
Expand All @@ -141,6 +153,27 @@ def _maybe_get_model(self) -> GTFS2VecModel:
raise ModelNotFitException("Model not fit! Run fit() or fit_transform() first.")
return self._model

def _prepare_train_val_features(
self,
regions_gdf: gpd.GeoDataFrame,
features_gdf: gpd.GeoDataFrame,
joint_gdf: gpd.GeoDataFrame,
val_regions_gdf: Optional[gpd.GeoDataFrame] = None,
val_features_gdf: Optional[gpd.GeoDataFrame] = None,
val_joint_gdf: Optional[gpd.GeoDataFrame] = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
self._validate_indexes(regions_gdf, features_gdf, joint_gdf)
features = self._prepare_features(regions_gdf, features_gdf, joint_gdf)
val_features = None
if (
val_regions_gdf is not None
and val_features_gdf is not None
and val_joint_gdf is not None
):
self._validate_indexes(val_regions_gdf, val_features_gdf, val_joint_gdf)
val_features = self._prepare_features(val_regions_gdf, val_features_gdf, val_joint_gdf)
return features, val_features

def _prepare_features(
self,
regions_gdf: gpd.GeoDataFrame,
Expand Down Expand Up @@ -227,12 +260,15 @@ def _normalize_features(self, features: pd.DataFrame) -> pd.DataFrame:

return features

def _train_model_unsupervised(self, features: pd.DataFrame) -> GTFS2VecModel:
def _train_model_unsupervised(
self, features: pd.DataFrame, val_features: Optional[pd.DataFrame]
) -> GTFS2VecModel:
"""
Train model unsupervised.

Args:
features (pd.DataFrame): Features.
val_features (Optional[pd.DataFrame]): Validation features.
"""
import pytorch_lightning as pl
from torch.utils.data import DataLoader
Expand All @@ -242,11 +278,17 @@ def _train_model_unsupervised(self, features: pd.DataFrame) -> GTFS2VecModel:
n_hidden=self._hidden_size,
n_embed=self._embedding_size,
)
X = features.to_numpy().astype(np.float32)
x_dataloader = DataLoader(X, batch_size=24, shuffle=True, num_workers=4)
train_x = features.to_numpy().astype(np.float32)
train_x_dataloader = DataLoader(train_x, batch_size=24, shuffle=True, num_workers=4)

val_x_dataloader = None
if val_features is not None:
val_x = val_features.to_numpy().astype(np.float32)
val_x_dataloader = DataLoader(val_x, batch_size=24, shuffle=True, num_workers=4)

trainer = pl.Trainer(max_epochs=10)

trainer.fit(model, x_dataloader)
trainer.fit(model, train_x_dataloader, val_x_dataloader)

return model

Expand Down
17 changes: 17 additions & 0 deletions srai/embedders/gtfs2vec/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,20 @@ def training_step(self, batch: "torch.Tensor", batch_idx: Any) -> "torch.Tensor"
loss = F.mse_loss(x_hat, x)
self.log("train_loss", loss)
return loss

def validation_step(self, batch: "torch.Tensor", batch_idx: Any) -> "torch.Tensor":
    """
    Compute and log the reconstruction loss for one validation batch.

    The batch is passed through the encoder and then the decoder; the mean squared
    error between the reconstruction and the input is logged as "val_loss".

    Args:
        batch (torch.Tensor): Batch.
        batch_idx (Any): Batch index (unused).

    Returns:
        torch.Tensor: The MSE reconstruction loss for the batch.
    """
    import torch.nn.functional as functional

    reconstruction = self.decoder(self.encoder(batch))
    val_loss = functional.mse_loss(reconstruction, batch)
    self.log("val_loss", val_loss)
    return val_loss
49 changes: 46 additions & 3 deletions srai/embedders/hex2vec/embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ def fit(
features_gdf: gpd.GeoDataFrame,
joint_gdf: gpd.GeoDataFrame,
neighbourhood: Neighbourhood[T],
val_regions_gdf: Optional[gpd.GeoDataFrame] = None,
val_features_gdf: Optional[gpd.GeoDataFrame] = None,
val_joint_gdf: Optional[gpd.GeoDataFrame] = None,
val_neighbourhood: Optional[Neighbourhood[T]] = None,
negative_sample_k_distance: int = 2,
batch_size: int = 32,
learning_rate: float = 0.001,
Expand All @@ -106,6 +110,14 @@ def fit(
joint_gdf (gpd.GeoDataFrame): Joiner result with region-feature multi-index.
neighbourhood (Neighbourhood[T]): The neighbourhood to use.
Should be initialized with the same regions.
val_regions_gdf (Optional[gpd.GeoDataFrame], optional): Validation region indexes and
geometries. Defaults to None.
val_features_gdf (Optional[gpd.GeoDataFrame], optional): Validation feature indexes,
geometries and feature values. Defaults to None.
val_joint_gdf (Optional[gpd.GeoDataFrame], optional): Validation joiner result with
region-feature multi-index. Defaults to None.
val_neighbourhood (Optional[Neighbourhood[T]], optional): Validation neighbourhood.
Defaults to None.
negative_sample_k_distance (int, optional): When sampling negative samples,
sample from a distance > k. Defaults to 2.
batch_size (int, optional): Batch size. Defaults to 32.
Expand Down Expand Up @@ -133,11 +145,26 @@ def fit(
self._model = Hex2VecModel(
layer_sizes=[num_features, *self._encoder_sizes], learning_rate=learning_rate
)
dataset = NeighbourDataset(counts_df, neighbourhood, negative_sample_k_distance)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
train_dataset = NeighbourDataset(counts_df, neighbourhood, negative_sample_k_distance)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

if (
val_regions_gdf is not None
and val_features_gdf is not None
and val_joint_gdf is not None
and val_neighbourhood is not None
):
val_counts_df = self._get_raw_counts(val_regions_gdf, val_features_gdf, val_joint_gdf)
val_dataset = NeighbourDataset(
val_counts_df, val_neighbourhood, negative_sample_k_distance
)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
else:
val_dataloader = None

trainer = pl.Trainer(**trainer_kwargs)
trainer.fit(self._model, dataloader)

trainer.fit(self._model, train_dataloader, val_dataloader)
self._is_fitted = True

def fit_transform(
Expand All @@ -146,6 +173,10 @@ def fit_transform(
features_gdf: gpd.GeoDataFrame,
joint_gdf: gpd.GeoDataFrame,
neighbourhood: Neighbourhood[T],
val_regions_gdf: Optional[gpd.GeoDataFrame] = None,
val_features_gdf: Optional[gpd.GeoDataFrame] = None,
val_joint_gdf: Optional[gpd.GeoDataFrame] = None,
val_neighbourhood: Optional[Neighbourhood[T]] = None,
negative_sample_k_distance: int = 2,
batch_size: int = 32,
learning_rate: float = 0.001,
Expand All @@ -160,6 +191,14 @@ def fit_transform(
joint_gdf (gpd.GeoDataFrame): Joiner result with region-feature multi-index.
neighbourhood (Neighbourhood[T]): The neighbourhood to use.
Should be initialized with the same regions.
val_regions_gdf (Optional[gpd.GeoDataFrame], optional): Validation region indexes and
geometries. Defaults to None.
val_features_gdf (Optional[gpd.GeoDataFrame], optional): Validation feature indexes,
geometries and feature values. Defaults to None.
val_joint_gdf (Optional[gpd.GeoDataFrame], optional): Validation joiner result with
region-feature multi-index. Defaults to None.
val_neighbourhood (Optional[Neighbourhood[T]], optional): Validation neighbourhood.
Defaults to None.
negative_sample_k_distance (int, optional): When sampling negative samples,
sample from a distance > k. Defaults to 2.
batch_size (int, optional): Batch size. Defaults to 32.
Expand All @@ -181,6 +220,10 @@ def fit_transform(
features_gdf,
joint_gdf,
neighbourhood,
val_regions_gdf,
val_features_gdf,
val_joint_gdf,
val_neighbourhood,
negative_sample_k_distance,
batch_size,
learning_rate,
Expand Down
44 changes: 37 additions & 7 deletions srai/embedders/highway2vec/embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ def fit(
regions_gdf: gpd.GeoDataFrame,
features_gdf: gpd.GeoDataFrame,
joint_gdf: gpd.GeoDataFrame,
val_regions_gdf: Optional[gpd.GeoDataFrame] = None,
val_features_gdf: Optional[gpd.GeoDataFrame] = None,
val_joint_gdf: Optional[gpd.GeoDataFrame] = None,
trainer_kwargs: Optional[Dict[str, Any]] = None,
dataloader_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
Expand All @@ -91,6 +94,9 @@ def fit(
regions_gdf (gpd.GeoDataFrame): Region indexes and geometries.
features_gdf (gpd.GeoDataFrame): Feature indexes, geometries and feature values.
joint_gdf (gpd.GeoDataFrame): Joiner result with region-feature multi-index.
val_regions_gdf (Optional[gpd.GeoDataFrame], optional): Validation region indexes.
val_features_gdf (Optional[gpd.GeoDataFrame], optional): Validation feature indexes.
val_joint_gdf (Optional[gpd.GeoDataFrame], optional): Validation joiner result.
trainer_kwargs (Optional[Dict[str, Any]], optional): Trainer kwargs. Defaults to None.
dataloader_kwargs (Optional[Dict[str, Any]], optional): Dataloader kwargs.
Defaults to None.
Expand All @@ -104,33 +110,45 @@ def fit(
import torch
from torch.utils.data import DataLoader

dataloader_kwargs = dataloader_kwargs or {}
if "batch_size" not in dataloader_kwargs:
dataloader_kwargs["batch_size"] = 128

self._validate_indexes(regions_gdf, features_gdf, joint_gdf)
features_df = self._remove_geometry_if_present(features_gdf)
dataloader = DataLoader(torch.Tensor(features_df.values), **dataloader_kwargs)

num_features = len(features_df.columns)
self._model = Highway2VecModel(
n_features=num_features, n_hidden=self._hidden_size, n_embed=self._embedding_size
)

dataloader_kwargs = dataloader_kwargs or {}
if "batch_size" not in dataloader_kwargs:
dataloader_kwargs["batch_size"] = 128

dataloader = DataLoader(torch.Tensor(features_df.values), **dataloader_kwargs)
val_dataloader = None
if (
val_regions_gdf is not None
and val_features_gdf is not None
and val_joint_gdf is not None
):
self._validate_indexes(val_regions_gdf, val_features_gdf, val_joint_gdf)
val_features_df = self._remove_geometry_if_present(val_features_gdf)
val_dataloader = DataLoader(torch.Tensor(val_features_df.values), **dataloader_kwargs)

trainer_kwargs = trainer_kwargs or {}
if "max_epochs" not in trainer_kwargs:
trainer_kwargs["max_epochs"] = 10

trainer = pl.Trainer(**trainer_kwargs)
trainer.fit(self._model, dataloader)
trainer.fit(self._model, dataloader, val_dataloader)
self._is_fitted = True

def fit_transform(
self,
regions_gdf: gpd.GeoDataFrame,
features_gdf: gpd.GeoDataFrame,
joint_gdf: gpd.GeoDataFrame,
val_regions_gdf: Optional[gpd.GeoDataFrame] = None,
val_features_gdf: Optional[gpd.GeoDataFrame] = None,
val_joint_gdf: Optional[gpd.GeoDataFrame] = None,
trainer_kwargs: Optional[Dict[str, Any]] = None,
dataloader_kwargs: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
Expand All @@ -141,6 +159,9 @@ def fit_transform(
regions_gdf (gpd.GeoDataFrame): Region indexes and geometries.
features_gdf (gpd.GeoDataFrame): Feature indexes, geometries and feature values.
joint_gdf (gpd.GeoDataFrame): Joiner result with region-feature multi-index.
val_regions_gdf (Optional[gpd.GeoDataFrame], optional): Validation region indexes.
val_features_gdf (Optional[gpd.GeoDataFrame], optional): Validation feature indexes.
val_joint_gdf (Optional[gpd.GeoDataFrame], optional): Validation joiner result.
trainer_kwargs (Optional[Dict[str, Any]], optional): Trainer kwargs. Defaults to None.
dataloader_kwargs (Optional[Dict[str, Any]], optional): Dataloader kwargs.
Defaults to None.
Expand All @@ -153,7 +174,16 @@ def fit_transform(
ValueError: If joint_gdf.index is not of type pd.MultiIndex or doesn't have 2 levels.
ValueError: If index levels in gdfs don't overlap correctly.
"""
self.fit(regions_gdf, features_gdf, joint_gdf, trainer_kwargs, dataloader_kwargs)
self.fit(
regions_gdf,
features_gdf,
joint_gdf,
val_regions_gdf,
val_features_gdf,
val_joint_gdf,
trainer_kwargs,
dataloader_kwargs,
)
return self.transform(regions_gdf, features_gdf, joint_gdf)

def _check_is_fitted(self) -> None:
Expand Down
12 changes: 11 additions & 1 deletion srai/embedders/highway2vec/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,24 @@ def training_step(self, batch: "torch.Tensor", batch_idx: int) -> "torch.Tensor"
"""
return self._common_step(batch, batch_idx, "train")

def validation_step(self, batch: "torch.Tensor", batch_idx: int) -> "torch.Tensor":
    """
    Run one validation batch through the shared step logic.

    Delegates to `_common_step` with the stage name "val" and returns its result.

    Args:
        batch (torch.Tensor): Batch.
        batch_idx (int): Batch index.
    """
    stage = "val"
    return self._common_step(batch, batch_idx, stage)

def _common_step(self, batch: "torch.Tensor", batch_idx: int, stage: str) -> "torch.Tensor":
"""
Perform common step.

Args:
batch (torch.Tensor): Batch.
batch_idx (int): Batch index.
stage (str): Name of the stage - e.g. train, valid, test.
stage (str): Name of the stage - e.g. train, val, test.
"""
import torch.nn.functional as F

Expand Down