-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implemented semantic segmentation (#6)
* Implemented semantic segmentation * Update tests * Temporarily comment tests * Only comment fit_predict tests * uncomment test * uncomment test * uncomment test * Reformat * Trying different hyperparameters * Comment parallel job
- Loading branch information
1 parent
080f2ff
commit 1f2af0d
Showing
8 changed files
with
333 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
Semantic segmentation | ||
===================== | ||
|
||
.. automodule:: patsemb.semantic_segmentation | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,14 +21,14 @@ simply running the following command: | |
From GitHub | ||
----------- | ||
|
||
You can also install ``PaTSEmb`` directly from `GitHub <https://github.com/ML-KULeuven/PaTSEmb>`_. | ||
You can also install ``PaTSEmb`` directly from `GitHub`_. | ||
To install version ``X.Y.Z``, you can use the following command: | ||
|
||
.. code-block:: bash | ||
pip install git+https://github.com/ML-KULeuven/[email protected] | ||
The `release page <https://github.com/ML-KULeuven/PaTSEmb/releases>`_ contains more | ||
The `release page`_ contains more | ||
information regarding the different versions. It is also possible to install the | ||
latest, *unreleased* version using the following command: | ||
|
||
|
@@ -40,11 +40,15 @@ From source | |
----------- | ||
|
||
It is also possible to install ``PaTSEmb`` directly from the source code. First, download | ||
the source from `GitHub <https://github.com/ML-KULeuven/PaTSEmb.git>`_. It is also | ||
possible to download the source code for a specific release on `the release page <https://github.com/ML-KULeuven/PaTSEmb/releases>`_. | ||
the source from `GitHub`_. It is also | ||
possible to download the source code for a specific release on the `release page`_. | ||
Unzip the files, and navigate to the root directory of the repository in the terminal. | ||
Finally, ``PaTSEmb`` can be installed through the following command: | ||
|
||
.. code-block:: bash | ||
pip install . | ||
.. _GitHub: https://github.com/ML-KULeuven/PaTSEmb | ||
.. _release page: https://github.com/ML-KULeuven/PaTSEmb/releases |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
149 changes: 149 additions & 0 deletions
149
patsemb/semantic_segmentation/LogisticRegressionSegmentor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
|
||
import inspect | ||
import multiprocessing | ||
import numpy as np | ||
from typing import Union, List | ||
|
||
from sklearn.exceptions import NotFittedError | ||
from sklearn.cluster import KMeans | ||
from sklearn.metrics import silhouette_score | ||
from sklearn.linear_model import LogisticRegression | ||
|
||
from patsemb.semantic_segmentation.ProbabilisticSemanticSegmentor import ProbabilisticSemanticSegmentor | ||
|
||
|
||
class LogisticRegressionSegmentor(ProbabilisticSemanticSegmentor): | ||
""" | ||
Segments the pattern-based embedding using Logistic Regression [carpentier2024pattern]_. | ||
First, a KMeans clustering model is fitted on the embedding, which will | ||
provide a discrete clustering (i.e., every observation in the time series | ||
will be assigned a discrete cluster label). The number of clusters `K` is | ||
decided based on the silhouette method. The discrete clustering give an | ||
initial indication of when the semantic segments occur. | ||
Second, the discrete clustering is fed to a logistic regression model. This | ||
model learns to which segment each time point of the pattern-based embedding | ||
belongs. Because logistic regression is a probabilistic model, we retrieve | ||
the probabilities of a given observation belong to a semantic segment, | ||
thereby obtaining a probabilistic segmentation. | ||
Parameters | ||
---------- | ||
n_segments: int or list of int, default=[2, 3, 4, 5, 6, 7, 8, 9] | ||
The number of segments. If a list of integers is passed, a clustering | ||
will be made for each value, and the best clustering is selected using | ||
the silhouette score. | ||
n_jobs: int, default=1 | ||
The number of jobs to use for computing the multiple clusterings. Has | ||
no effect if ``n_segments`` is an integer. | ||
**kwargs: | ||
Additional arguments to be passed to either ``KMeans`` clutering or | ||
``LogisticRegression`` (both using Sklearn implementation). This class | ||
automatically infers which parameters can be passed to either object | ||
using the ``inspect`` module. If a parameter is valid for both models | ||
(e.g., ``max_iter``), then it will be passed to both. If an additional | ||
argument is given, which is not valid for KMeans nor for LogisticRegression, | ||
a TypeError will be thrown. | ||
A TypeError will also be raised if ``n_clusters`` is passed to this | ||
object - even though it is valid for ``KMeans`` - because this parameter | ||
will be set based on ``n_segments``. | ||
Attributes | ||
---------- | ||
k_means_kwargs: dict | ||
The arguments to pass to SKlearn KMeans. | ||
logistic_regression_kwargs: dict | ||
The arguments to pass to SKlearn LogisticRegression. | ||
logistic_regression_: LogisticRegression | ||
The fitted SKlearn Logistic Regression model. | ||
References | ||
---------- | ||
.. [carpentier2024pattern] Carpentier, Louis, Feremans, Len, Meert, Wannes, Verbeke, Mathias. | ||
"Pattern-based Time Series Semantic Segmentation with Gradual State Transitions." Proceedings | ||
of the 2024 SIAM International Conference on Data Mining (SDM). Society for Industrial and | ||
Applied Mathematics, 2024, doi: `10.1137/1.9781611978032.36 <https://doi.org/10.1137/1.9781611978032.36>`_. | ||
""" | ||
n_segments: Union[int, List[int]] | ||
n_jobs: int | ||
kwargs: dict | ||
|
||
k_means_kwargs: dict | ||
logistic_regression_kwargs: dict | ||
|
||
logistic_regression_: LogisticRegression | ||
|
||
def __init__(self, | ||
n_segments: Union[List[int], int] = None, | ||
n_jobs: int = 1, | ||
**kwargs): | ||
|
||
self.n_segments: List[int] = \ | ||
list(range(2, 10)) if n_segments is None else \ | ||
[n_segments] if isinstance(n_segments, int) else \ | ||
n_segments | ||
self.n_jobs = n_jobs | ||
self.kwargs = kwargs | ||
|
||
# Separate the kwargs | ||
self.k_means_kwargs = {key: value for key, value in kwargs.items() if key in inspect.signature(KMeans).parameters} | ||
self.logistic_regression_kwargs = {key: value for key, value in kwargs.items() if key in inspect.signature(LogisticRegression).parameters} | ||
|
||
if 'n_clusters' in self.k_means_kwargs: | ||
raise TypeError("Parameter 'n_clusters' should not be passed!") | ||
|
||
# Check if invalid arguments were given | ||
valid_kwargs = dict(self.k_means_kwargs, **self.logistic_regression_kwargs) | ||
if len(valid_kwargs) != len(kwargs): | ||
invalid_kwargs = [arg for arg in kwargs.keys() if arg not in valid_kwargs] | ||
raise TypeError(f"Parameters were given that do not belong to K-Means or Logistic Regression: {invalid_kwargs}") | ||
|
||
def fit(self, X: np.ndarray, y=None) -> 'ProbabilisticSemanticSegmentor': | ||
|
||
# If there is only one value for n_segments given, we can simply compute the clustering | ||
if len(self.n_segments) == 1: | ||
clustering = KMeans(n_clusters=self.n_segments[0], **self.k_means_kwargs).fit_predict(X.T) | ||
|
||
# Otherwise, use parallelization and select the best clustering | ||
else: | ||
|
||
# Compute clusters with different number of segments | ||
args = [(X.T, n_segments) for n_segments in self.n_segments] | ||
if self.n_jobs > 1: | ||
with multiprocessing.Pool(self.n_jobs) as pool: | ||
pool_results = pool.starmap(self._compute_kmeans_segmentation, args) | ||
else: | ||
pool_results = [self._compute_kmeans_segmentation(*arg) for arg in args] | ||
|
||
# Identify the best cluster with maximum silhouette score | ||
index_largest_silhouette_score = np.argmax([silhouette_avg for silhouette_avg, *_ in pool_results]) | ||
clustering = pool_results[index_largest_silhouette_score][1] | ||
|
||
# Fit the logistic regression model | ||
self.logistic_regression_ = LogisticRegression(**self.logistic_regression_kwargs) | ||
self.logistic_regression_.fit(X.T, clustering) | ||
|
||
# Return self | ||
return self | ||
|
||
def predict_proba(self, X: np.ndarray) -> np.ndarray: | ||
if not hasattr(self, 'logistic_regression_'): | ||
raise NotFittedError('Call the fit method before predicting!') | ||
return self.logistic_regression_.predict_proba(X.T) | ||
|
||
def _compute_kmeans_segmentation(self, X: np.ndarray, n_segments: int): | ||
# Cluster the embedding | ||
k_means = KMeans(n_clusters=n_segments, **self.k_means_kwargs) | ||
segmentation = k_means.fit_predict(X) | ||
|
||
# Compute silhouette score | ||
if len(set(segmentation)) != n_segments: | ||
silhouette_avg = -1 | ||
else: | ||
n = X.shape[0] | ||
sample_size = n if n < 2000 else 2000 + int(0.1 * (n - 2000)) | ||
silhouette_avg = silhouette_score(X, segmentation, sample_size=sample_size) | ||
|
||
return silhouette_avg, segmentation |
73 changes: 73 additions & 0 deletions
73
patsemb/semantic_segmentation/ProbabilisticSemanticSegmentor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
|
||
import abc | ||
import numpy as np | ||
|
||
|
||
class ProbabilisticSemanticSegmentor(abc.ABC): | ||
""" | ||
Learn a probabilistic semantic segmentation over the pattern-based | ||
embedding. This enables to learn gradual transitions over the semantic | ||
segmentation as intervals where the probability of one semantic segment | ||
increases while the probability of another semantic segment decreases. | ||
Because segment probabilities are predicted, this class uses the fit-predict_proba | ||
interface (including a ``fit_predict_proba`` method) to make predictions. | ||
See Also | ||
-------- | ||
LogisticRegressionSegmentor: predict semantic segments using logistic regression. | ||
""" | ||
|
||
@abc.abstractmethod | ||
def fit(self, X: np.ndarray, y=None) -> 'ProbabilisticSemanticSegmentor': | ||
""" | ||
Fit this probabilistic semantic segmentor. | ||
Parameters | ||
---------- | ||
X: np.ndarray of shape (n_patterns, n_samples) | ||
The embedding matrix to use for fitting this probabilistic semantic segmentor. | ||
y: array-like, default=None | ||
Ground-truth information. | ||
Returns | ||
------- | ||
self: ProbabilisticSemanticSegmentor | ||
Returns the instance itself | ||
""" | ||
|
||
@abc.abstractmethod | ||
def predict_proba(self, X: np.ndarray) -> np.ndarray: | ||
""" | ||
Predict the probabilistic semantic segment probabilities, based on | ||
the given pattern-based embedding. | ||
Parameters | ||
---------- | ||
X: np.ndarray of shape (n_patterns, n_samples) | ||
The embedding matrix which should be transformed. | ||
Returns | ||
------- | ||
segment_probabilities: np.ndarray of shape (n_samples, n_segments) | ||
The predicted semantic segment probabilities. | ||
""" | ||
|
||
def fit_predict_proba(self, X: np.ndarray, y=None) -> np.ndarray: | ||
""" | ||
Fit this postprocessor using the given pattern-based embedding, and | ||
immediately transform it. | ||
Parameters | ||
---------- | ||
X: np.ndarray of shape (n_patterns, n_samples) | ||
The embedding matrix to use for fitting this probabilistic semantic segmentor. | ||
y: array-like, default=None | ||
Ground-truth information. | ||
Returns | ||
------- | ||
segment_probabilities: np.ndarray of shape (n_samples, n_segments) | ||
The predicted semantic segment probabilities. | ||
""" | ||
return self.fit(X, y).predict_proba(X) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
|
||
""" | ||
This module offers functionality to compute a semantic segmentation from | ||
a pattern-based embedding. It can be imported as follows: | ||
>>> from patsemb import semantic_segmentation | ||
Currently, only a probabilistic semantic segmentor is implemented. This segmentor | ||
uses the fit-predict_proba interface, because it predicts segment probabilities | ||
instead of segment labels. | ||
""" | ||
|
||
from .ProbabilisticSemanticSegmentor import ProbabilisticSemanticSegmentor | ||
from .LogisticRegressionSegmentor import LogisticRegressionSegmentor | ||
|
||
__all__ = [ | ||
'ProbabilisticSemanticSegmentor', | ||
'LogisticRegressionSegmentor' | ||
] |
Empty file.
77 changes: 77 additions & 0 deletions
77
tests/semantic_segmentation/test_LogisticRegressionSegmentor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import numpy as np | ||
import pytest | ||
from sklearn.exceptions import NotFittedError | ||
from patsemb.semantic_segmentation import LogisticRegressionSegmentor | ||
from patsemb.pattern_based_embedding import PatternBasedEmbedder | ||
|
||
|
||
@pytest.fixture | ||
def pattern_based_embedding() -> np.ndarray: | ||
univariate_time_series = np.sin(np.arange(0, 50, 0.05)) + np.random.normal(0, 0.25, 1000) | ||
return PatternBasedEmbedder().fit_transform(univariate_time_series) | ||
|
||
|
||
class TestLogisticRegressionSegmentor: | ||
|
||
def test_initialization_n_segments(self): | ||
clf = LogisticRegressionSegmentor() | ||
assert clf.n_segments == [2, 3, 4, 5, 6, 7, 8, 9] | ||
|
||
clf = LogisticRegressionSegmentor(n_segments=[2, 3, 4, 5]) | ||
assert clf.n_segments == [2, 3, 4, 5] | ||
|
||
clf = LogisticRegressionSegmentor(n_segments=4) | ||
assert clf.n_segments == [4] | ||
|
||
def test_initialization_n_jobs(self): | ||
clf = LogisticRegressionSegmentor() | ||
assert clf.n_jobs == 1 | ||
|
||
clf = LogisticRegressionSegmentor(n_jobs=4) | ||
assert clf.n_jobs == 4 | ||
|
||
def test_initialization_kwargs(self): | ||
clf = LogisticRegressionSegmentor(penalty='l2', tol=1e-4, init='random', max_iter=50) | ||
assert 'tol' in clf.k_means_kwargs | ||
assert 'init' in clf.k_means_kwargs | ||
assert 'max_iter' in clf.k_means_kwargs | ||
assert len(clf.k_means_kwargs) == 3 | ||
|
||
assert 'penalty' in clf.logistic_regression_kwargs | ||
assert 'tol' in clf.logistic_regression_kwargs | ||
assert 'max_iter' in clf.logistic_regression_kwargs | ||
assert len(clf.logistic_regression_kwargs) == 3 | ||
|
||
def test_initialization_n_clusters(self): | ||
with pytest.raises(TypeError): | ||
LogisticRegressionSegmentor(n_clusters=5) | ||
|
||
def test_initialization_additional_args(self): | ||
with pytest.raises(TypeError): | ||
LogisticRegressionSegmentor(something_invalid=0) | ||
|
||
def test_fit(self, pattern_based_embedding): | ||
clf = LogisticRegressionSegmentor() | ||
assert clf.fit(pattern_based_embedding) == clf | ||
|
||
def test_predict_proba(self, pattern_based_embedding): | ||
clf = LogisticRegressionSegmentor() | ||
clf.fit(pattern_based_embedding) | ||
pred = clf.predict_proba(pattern_based_embedding) | ||
assert pred.shape[0] == pattern_based_embedding.shape[1] | ||
|
||
def test_fit_predict_proba(self, pattern_based_embedding): | ||
pred = LogisticRegressionSegmentor().fit_predict_proba(pattern_based_embedding) | ||
assert pred.shape[0] == pattern_based_embedding.shape[1] | ||
|
||
def test_fit_predict_proba_one_n_segment(self, pattern_based_embedding): | ||
pred = LogisticRegressionSegmentor(n_segments=3).fit_predict_proba(pattern_based_embedding) | ||
assert pred.shape == (pattern_based_embedding.shape[1], 3) | ||
|
||
# def test_fit_predict_proba_multiple_jobs(self, pattern_based_embedding): | ||
# pred = LogisticRegressionSegmentor(n_segments=[3, 4], n_jobs=2).fit_predict_proba(pattern_based_embedding) | ||
# assert pred.shape[0] == pattern_based_embedding.shape[1] | ||
|
||
def test_predict_proba_not_fitted(self, pattern_based_embedding): | ||
with pytest.raises(NotFittedError): | ||
LogisticRegressionSegmentor().predict_proba(pattern_based_embedding) |