Implemented semantic segmentation (#6)

* Implemented semantic segmentation * Update tests * Temporarily comment tests * Only comment fit_predict tests * uncomment test * uncomment test * uncomment test * Reformat * Trying different hyperparameters * Comment parallel job
ML-KULeuven · Dec 5, 2024 · 1f2af0d · 1f2af0d
1 parent 080f2ff
commit 1f2af0d
Show file tree

Hide file tree

Showing 8 changed files with 333 additions and 9 deletions.
diff --git a/docs/api/semantic_segmentation.rst b/docs/api/semantic_segmentation.rst
@@ -0,0 +1,5 @@
+Semantic segmentation
+=====================
+
+.. automodule:: patsemb.semantic_segmentation
+    :members:
diff --git a/docs/getting_started/installation.rst b/docs/getting_started/installation.rst
@@ -21,14 +21,14 @@ simply running the following command:
 From GitHub
 -----------
 
-You can also install ``PaTSEmb`` directly from `GitHub <https://github.com/ML-KULeuven/PaTSEmb>`_.
+You can also install ``PaTSEmb`` directly from `GitHub`_.
 To install version ``X.Y.Z``, you can use the following command:
 
 .. code-block:: bash
 
     pip install git+https://github.com/ML-KULeuven/PaTSEmb.git@X.Y.Z
 
-The `release page <https://github.com/ML-KULeuven/PaTSEmb/releases>`_ contains more
+The `release page`_ contains more
 information regarding the different versions. It is also possible to install the
 latest, *unreleased* version using the following command:
 
@@ -40,11 +40,15 @@ From source
 -----------
 
 It is also possible to install ``PaTSEmb`` directly from the source code. First, download
-the source from `GitHub <https://github.com/ML-KULeuven/PaTSEmb.git>`_. It is also
-possible to download the source code for a specific release on `the release page <https://github.com/ML-KULeuven/PaTSEmb/releases>`_.
+the source from `GitHub`_. It is also
+possible to download the source code for a specific release on the `release page`_.
 Unzip the files, and navigate to the root directory of the repository in the terminal.
 Finally, ``PaTSEmb`` can be installed through the following command:
 
 .. code-block:: bash
 
     pip install .
+
+
+.. _GitHub: https://github.com/ML-KULeuven/PaTSEmb
+.. _release page: https://github.com/ML-KULeuven/PaTSEmb/releases
diff --git a/docs/index.rst b/docs/index.rst
@@ -19,9 +19,6 @@ The source is available on `GitHub <https://github.com/ML-KULeuven/PaTSEmb>`_.
 .. toctree::
    :maxdepth: 1
    :caption: Documentation
+   :glob:
 
-   api/discretization
-   api/pattern_based_embedding
-   api/pattern_mining
-   api/postprocess
-   api/utils
+   api/*
diff --git a/patsemb/semantic_segmentation/LogisticRegressionSegmentor.py b/patsemb/semantic_segmentation/LogisticRegressionSegmentor.py
@@ -0,0 +1,149 @@
+
+import inspect
+import multiprocessing
+import numpy as np
+from typing import Union, List
+
+from sklearn.exceptions import NotFittedError
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+from sklearn.linear_model import LogisticRegression
+
+from patsemb.semantic_segmentation.ProbabilisticSemanticSegmentor import ProbabilisticSemanticSegmentor
+
+
+class LogisticRegressionSegmentor(ProbabilisticSemanticSegmentor):
+    """
+    Segments the pattern-based embedding using Logistic Regression [carpentier2024pattern]_.
+
+    First, a KMeans clustering model is fitted on the embedding, which will
+    provide a discrete clustering (i.e., every observation in the time series
+    will be assigned a discrete cluster label). The number of clusters `K` is
+    decided based on the silhouette method. The discrete clustering gives an
+    initial indication of when the semantic segments occur.
+
+    Second, the discrete clustering is fed to a logistic regression model. This
+    model learns to which segment each time point of the pattern-based embedding
+    belongs. Because logistic regression is a probabilistic model, we retrieve
+    the probabilities of a given observation belonging to a semantic segment,
+    thereby obtaining a probabilistic segmentation.
+
+    Parameters
+    ----------
+    n_segments: int or list of int, default=[2, 3, 4, 5, 6, 7, 8, 9]
+        The number of segments. If a list of integers is passed, a clustering
+        will be made for each value, and the best clustering is selected using
+        the silhouette score.
+    n_jobs: int, default=1
+        The number of jobs to use for computing the multiple clusterings. Has
+        no effect if ``n_segments`` is an integer.
+    **kwargs:
+        Additional arguments to be passed to either ``KMeans`` clustering or
+        ``LogisticRegression`` (both using Sklearn implementation). This class
+        automatically infers which parameters can be passed to either object
+        using the ``inspect`` module. If a parameter is valid for both models
+        (e.g., ``max_iter``), then it will be passed to both. If an additional
+        argument is given that is valid for neither KMeans nor LogisticRegression,
+        a TypeError will be raised.
+
+        A TypeError will also be raised if ``n_clusters`` is passed to this
+        object - even though it is valid for ``KMeans`` - because this parameter
+        will be set based on ``n_segments``.
+
+    Attributes
+    ----------
+    k_means_kwargs: dict
+        The arguments to pass to SKlearn KMeans.
+    logistic_regression_kwargs: dict
+        The arguments to pass to SKlearn LogisticRegression.
+    logistic_regression_: LogisticRegression
+        The fitted SKlearn Logistic Regression model.
+
+    References
+    ----------
+    .. [carpentier2024pattern] Carpentier, Louis, Feremans, Len, Meert, Wannes, Verbeke, Mathias.
+       "Pattern-based Time Series Semantic Segmentation with Gradual State Transitions." Proceedings
+       of the 2024 SIAM International Conference on Data Mining (SDM). Society for Industrial and
+       Applied Mathematics, 2024, doi: `10.1137/1.9781611978032.36 <https://doi.org/10.1137/1.9781611978032.36>`_.
+    """
+    n_segments: Union[int, List[int]]
+    n_jobs: int
+    kwargs: dict
+
+    k_means_kwargs: dict
+    logistic_regression_kwargs: dict
+
+    logistic_regression_: LogisticRegression
+
+    def __init__(self,
+                 n_segments: Union[List[int], int] = None,
+                 n_jobs: int = 1,
+                 **kwargs):
+
+        self.n_segments: List[int] = \
+            list(range(2, 10)) if n_segments is None else \
+            [n_segments] if isinstance(n_segments, int) else \
+            n_segments
+        self.n_jobs = n_jobs
+        self.kwargs = kwargs
+
+        # Separate the kwargs
+        self.k_means_kwargs = {key: value for key, value in kwargs.items() if key in inspect.signature(KMeans).parameters}
+        self.logistic_regression_kwargs = {key: value for key, value in kwargs.items() if key in inspect.signature(LogisticRegression).parameters}
+
+        if 'n_clusters' in self.k_means_kwargs:
+            raise TypeError("Parameter 'n_clusters' should not be passed!")
+
+        # Check if invalid arguments were given
+        valid_kwargs = dict(self.k_means_kwargs, **self.logistic_regression_kwargs)
+        if len(valid_kwargs) != len(kwargs):
+            invalid_kwargs = [arg for arg in kwargs.keys() if arg not in valid_kwargs]
+            raise TypeError(f"Parameters were given that do not belong to K-Means or Logistic Regression: {invalid_kwargs}")
+
+    def fit(self, X: np.ndarray, y=None) -> 'ProbabilisticSemanticSegmentor':
+
+        # If there is only one value for n_segments given, we can simply compute the clustering
+        if len(self.n_segments) == 1:
+            clustering = KMeans(n_clusters=self.n_segments[0], **self.k_means_kwargs).fit_predict(X.T)
+
+        # Otherwise, compute one clustering per candidate value (in parallel if n_jobs > 1) and select the best
+        else:
+
+            # Compute clusters with different number of segments
+            args = [(X.T, n_segments) for n_segments in self.n_segments]
+            if self.n_jobs > 1:
+                with multiprocessing.Pool(self.n_jobs) as pool:
+                    pool_results = pool.starmap(self._compute_kmeans_segmentation, args)
+            else:
+                pool_results = [self._compute_kmeans_segmentation(*arg) for arg in args]
+
+            # Identify the clustering with the maximum silhouette score
+            index_largest_silhouette_score = np.argmax([silhouette_avg for silhouette_avg, *_ in pool_results])
+            clustering = pool_results[index_largest_silhouette_score][1]
+
+        # Fit the logistic regression model
+        self.logistic_regression_ = LogisticRegression(**self.logistic_regression_kwargs)
+        self.logistic_regression_.fit(X.T, clustering)
+
+        # Return self
+        return self
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        if not hasattr(self, 'logistic_regression_'):
+            raise NotFittedError('Call the fit method before predicting!')
+        return self.logistic_regression_.predict_proba(X.T)
+
+    def _compute_kmeans_segmentation(self, X: np.ndarray, n_segments: int):
+        # Cluster the embedding
+        k_means = KMeans(n_clusters=n_segments, **self.k_means_kwargs)
+        segmentation = k_means.fit_predict(X)
+
+        # Compute silhouette score
+        if len(set(segmentation)) != n_segments:
+            silhouette_avg = -1
+        else:
+            n = X.shape[0]
+            sample_size = n if n < 2000 else 2000 + int(0.1 * (n - 2000))
+            silhouette_avg = silhouette_score(X, segmentation, sample_size=sample_size)
+
+        return silhouette_avg, segmentation
diff --git a/patsemb/semantic_segmentation/ProbabilisticSemanticSegmentor.py b/patsemb/semantic_segmentation/ProbabilisticSemanticSegmentor.py
@@ -0,0 +1,73 @@
+
+import abc
+import numpy as np
+
+
+class ProbabilisticSemanticSegmentor(abc.ABC):
+    """
+    Learn a probabilistic semantic segmentation over the pattern-based
+    embedding. This enables learning gradual transitions over the semantic
+    segmentation as intervals where the probability of one semantic segment
+    increases while the probability of another semantic segment decreases.
+
+    Because segment probabilities are predicted, this class uses the fit-predict_proba
+    interface (including a ``fit_predict_proba`` method) to make predictions.
+
+    See Also
+    --------
+    LogisticRegressionSegmentor: predict semantic segments using logistic regression.
+    """
+
+    @abc.abstractmethod
+    def fit(self, X: np.ndarray, y=None) -> 'ProbabilisticSemanticSegmentor':
+        """
+        Fit this probabilistic semantic segmentor.
+
+        Parameters
+        ----------
+        X: np.ndarray of shape (n_patterns, n_samples)
+            The embedding matrix to use for fitting this probabilistic semantic segmentor.
+        y: array-like, default=None
+            Ground-truth information.
+
+        Returns
+        -------
+        self: ProbabilisticSemanticSegmentor
+            Returns the instance itself
+        """
+
+    @abc.abstractmethod
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict the semantic segment probabilities, based on
+        the given pattern-based embedding.
+
+        Parameters
+        ----------
+        X: np.ndarray of shape (n_patterns, n_samples)
+            The embedding matrix for which the segment probabilities should be predicted.
+
+        Returns
+        -------
+        segment_probabilities: np.ndarray of shape (n_samples, n_segments)
+            The predicted semantic segment probabilities.
+        """
+
+    def fit_predict_proba(self, X: np.ndarray, y=None) -> np.ndarray:
+        """
+        Fit this probabilistic semantic segmentor using the given pattern-based
+        embedding, and immediately predict the segment probabilities for it.
+
+        Parameters
+        ----------
+        X: np.ndarray of shape (n_patterns, n_samples)
+            The embedding matrix to use for fitting this probabilistic semantic segmentor.
+        y: array-like, default=None
+            Ground-truth information.
+
+        Returns
+        -------
+        segment_probabilities: np.ndarray of shape (n_samples, n_segments)
+            The predicted semantic segment probabilities.
+        """
+        return self.fit(X, y).predict_proba(X)
diff --git a/patsemb/semantic_segmentation/__init__.py b/patsemb/semantic_segmentation/__init__.py
@@ -0,0 +1,19 @@
+
+"""
+This module offers functionality to compute a semantic segmentation from
+a pattern-based embedding. It can be imported as follows:
+
+>>> from patsemb import semantic_segmentation
+
+Currently, only a probabilistic semantic segmentor is implemented. This segmentor
+uses the fit-predict_proba interface, because it predicts segment probabilities
+instead of segment labels.
+"""
+
+from .ProbabilisticSemanticSegmentor import ProbabilisticSemanticSegmentor
+from .LogisticRegressionSegmentor import LogisticRegressionSegmentor
+
+__all__ = [
+    'ProbabilisticSemanticSegmentor',
+    'LogisticRegressionSegmentor'
+]
diff --git a/tests/semantic_segmentation/__init__.py b/tests/semantic_segmentation/__init__.py
diff --git a/tests/semantic_segmentation/test_LogisticRegressionSegmentor.py b/tests/semantic_segmentation/test_LogisticRegressionSegmentor.py
@@ -0,0 +1,77 @@
+import numpy as np
+import pytest
+from sklearn.exceptions import NotFittedError
+from patsemb.semantic_segmentation import LogisticRegressionSegmentor
+from patsemb.pattern_based_embedding import PatternBasedEmbedder
+
+
+@pytest.fixture
+def pattern_based_embedding() -> np.ndarray:
+    univariate_time_series = np.sin(np.arange(0, 50, 0.05)) + np.random.normal(0, 0.25, 1000)
+    return PatternBasedEmbedder().fit_transform(univariate_time_series)
+
+
+class TestLogisticRegressionSegmentor:
+
+    def test_initialization_n_segments(self):
+        clf = LogisticRegressionSegmentor()
+        assert clf.n_segments == [2, 3, 4, 5, 6, 7, 8, 9]
+
+        clf = LogisticRegressionSegmentor(n_segments=[2, 3, 4, 5])
+        assert clf.n_segments == [2, 3, 4, 5]
+
+        clf = LogisticRegressionSegmentor(n_segments=4)
+        assert clf.n_segments == [4]
+
+    def test_initialization_n_jobs(self):
+        clf = LogisticRegressionSegmentor()
+        assert clf.n_jobs == 1
+
+        clf = LogisticRegressionSegmentor(n_jobs=4)
+        assert clf.n_jobs == 4
+
+    def test_initialization_kwargs(self):
+        clf = LogisticRegressionSegmentor(penalty='l2', tol=1e-4, init='random', max_iter=50)
+        assert 'tol' in clf.k_means_kwargs
+        assert 'init' in clf.k_means_kwargs
+        assert 'max_iter' in clf.k_means_kwargs
+        assert len(clf.k_means_kwargs) == 3
+
+        assert 'penalty' in clf.logistic_regression_kwargs
+        assert 'tol' in clf.logistic_regression_kwargs
+        assert 'max_iter' in clf.logistic_regression_kwargs
+        assert len(clf.logistic_regression_kwargs) == 3
+
+    def test_initialization_n_clusters(self):
+        with pytest.raises(TypeError):
+            LogisticRegressionSegmentor(n_clusters=5)
+
+    def test_initialization_additional_args(self):
+        with pytest.raises(TypeError):
+            LogisticRegressionSegmentor(something_invalid=0)
+
+    def test_fit(self, pattern_based_embedding):
+        clf = LogisticRegressionSegmentor()
+        assert clf.fit(pattern_based_embedding) == clf
+
+    def test_predict_proba(self, pattern_based_embedding):
+        clf = LogisticRegressionSegmentor()
+        clf.fit(pattern_based_embedding)
+        pred = clf.predict_proba(pattern_based_embedding)
+        assert pred.shape[0] == pattern_based_embedding.shape[1]
+
+    def test_fit_predict_proba(self, pattern_based_embedding):
+        pred = LogisticRegressionSegmentor().fit_predict_proba(pattern_based_embedding)
+        assert pred.shape[0] == pattern_based_embedding.shape[1]
+
+    def test_fit_predict_proba_one_n_segment(self, pattern_based_embedding):
+        pred = LogisticRegressionSegmentor(n_segments=3).fit_predict_proba(pattern_based_embedding)
+        assert pred.shape == (pattern_based_embedding.shape[1], 3)
+
+    # def test_fit_predict_proba_multiple_jobs(self, pattern_based_embedding):
+    #     pred = LogisticRegressionSegmentor(n_segments=[3, 4], n_jobs=2).fit_predict_proba(pattern_based_embedding)
+    #     assert pred.shape[0] == pattern_based_embedding.shape[1]
+
+    def test_predict_proba_not_fitted(self, pattern_based_embedding):
+        with pytest.raises(NotFittedError):
+            LogisticRegressionSegmentor().predict_proba(pattern_based_embedding)