From 422ba54ef34c47977c39fa7ac9dcd2e3a03cb063 Mon Sep 17 00:00:00 2001
From: Saulo Martiello Mastelini
Date: Thu, 26 Oct 2023 17:08:38 -0300
Subject: [PATCH 1/7] add NoDrift detector

---
 docs/releases/unreleased.md            | 12 +++--
 river/drift/__init__.py                |  2 +
 river/drift/no_drift.py                | 75 ++++++++++++++++++++++++++
 river/forest/adaptive_random_forest.py | 57 ++++++++++----------
 4 files changed, 112 insertions(+), 34 deletions(-)
 create mode 100644 river/drift/no_drift.py

diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
index bb58355b93..9a6f0ba2d3 100644
--- a/docs/releases/unreleased.md
+++ b/docs/releases/unreleased.md
@@ -8,6 +8,10 @@ River's mini-batch methods now support pandas v2. In particular, River conforms
 - Made `score_one` method of `anomaly.LocalOutlierFactor` stateless
 - Defined default score for uninitialized detector
 
+## covariance
+
+- Added `_from_state` method to `covariance.EmpiricalCovariance` to warm start from previous knowledge.
+
 ## clustering
 
 - Add fixes to `cluster.DBSTREAM` algorithm, including:
@@ -22,13 +26,13 @@ River's mini-batch methods now support pandas v2. In particular, River conforms
 
 - Added `datasets.WebTraffic`, which is a dataset that counts the occurrences of events on a website. It is a multi-output regression dataset with two outputs.
 
-## forest
+## drift
 
-- Simplify the inner structures of `forest.ARFClassifier` and `forest.ARFRegressor` by removing redundant class hierarchy. Simplify how concept drift logging can be accessed in individual trees and in the forest as a whole.
+- Add `drift.NoDrift` to allow disabling the drift detection capabilities of models. This detector does nothing and always returns `False` when queried whether or not a concept drift was detected.
 
-## covariance
+## forest
 
-- Added `_from_state` method to `covariance.EmpiricalCovariance` to warm start from previous knowledge.
+- Simplify the inner structures of `forest.ARFClassifier` and `forest.ARFRegressor` by removing redundant class hierarchy. Simplify how concept drift logging can be accessed in individual trees and in the forest as a whole.
 
 ## proba
 
diff --git a/river/drift/__init__.py b/river/drift/__init__.py
index 70ba3746c1..b434cd7a5b 100644
--- a/river/drift/__init__.py
+++ b/river/drift/__init__.py
@@ -12,6 +12,7 @@
 from .adwin import ADWIN
 from .dummy import DummyDriftDetector
 from .kswin import KSWIN
+from .no_drift import NoDrift
 from .page_hinkley import PageHinkley
 from .retrain import DriftRetrainingClassifier
 
@@ -22,6 +23,7 @@
     "DriftRetrainingClassifier",
     "DummyDriftDetector",
     "KSWIN",
+    "NoDrift",
    "PageHinkley",
     "PeriodicTrigger",
 ]
diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
new file mode 100644
index 0000000000..3041519320
--- /dev/null
+++ b/river/drift/no_drift.py
@@ -0,0 +1,75 @@
+from river import base
+from river.base.drift_detector import DriftDetector
+
+
+class NoDrift(base.DriftDetector):
+    """Dummy class used to turn off concept drift detection capabilities of adaptive models.
+
+
+    It always signals that no concept drift was detected.
+
+
+    Examples
+    --------
+    >>> from river import drift
+    >>> from river import evaluate
+    >>> from river import forest
+    >>> from river import metrics
+    >>> from river.datasets import synth
+
+    >>> dataset = synth.ConceptDriftStream(
+    ...     seed=93,
+    ...     position=500,
+    ...     width=40,
+    ... )
+
+    We can turn off the warning detection capabilities of Adaptive Random Forest (ARF) or
+    other similar models. Thus, the base models will reset immediately after identifying a drift,
+    bypassing the background model building phase:
+
+    >>> model = forest.ARFClassifier(
+    ...     leaf_prediction="mc",
+    ...     warning_detector=drift.NoDrift(),
+    ...     seed=8
+    ... )
+    >>> metric = metrics.Accuracy()
+
+    >>> evaluate.progressive_val_score(dataset.take(700), model, metric)
+    Accuracy: 69.96%
+
+    >>> model.n_drifts_detected()
+    1
+
+    >>> model.n_warnings_detected()
+    0
+
+    We can also turn off the concept drift handling capabilities completely:
+
+    >>> stationary_model = forest.ARFClassifier(
+    ...     leaf_prediction="mc",
+    ...     warning_detector=drift.NoDrift(),
+    ...     drift_detector=drift.NoDrift(),
+    ...     seed=8
+    ... )
+    >>> metric = metrics.Accuracy()
+
+    >>> evaluate.progressive_val_score(dataset.take(700), stationary_model, metric)
+    Accuracy: 71.10%
+
+    >>> stationary_model.n_drifts_detected()
+    0
+
+    >>> stationary_model.n_warnings_detected()
+    0
+
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def update(self, x: int | float) -> DriftDetector:
+        return self
+
+    @property
+    def drift_detected(self):
+        return False
\ No newline at end of file
diff --git a/river/forest/adaptive_random_forest.py b/river/forest/adaptive_random_forest.py
index 8e7a9c6645..c96d83e231 100644
--- a/river/forest/adaptive_random_forest.py
+++ b/river/forest/adaptive_random_forest.py
@@ -10,7 +10,7 @@
 import numpy as np
 
 from river import base, metrics, stats
-from river.drift import ADWIN
+from river.drift import ADWIN, NoDrift
 from river.tree.hoeffding_tree_classifier import HoeffdingTreeClassifier
 from river.tree.hoeffding_tree_regressor import HoeffdingTreeRegressor
 from river.tree.nodes.arf_htc_nodes import (
@@ -32,8 +32,8 @@ def __init__(
         n_models: int,
         max_features: bool | str | int,
         lambda_value: int,
-        drift_detector: base.DriftDetector | None,
-        warning_detector: base.DriftDetector | None,
+        drift_detector: base.DriftDetector,
+        warning_detector: base.DriftDetector,
         metric: metrics.base.MultiClassMetric | metrics.base.RegressionMetric,
         disable_weighted_vote,
         seed,
@@ -50,20 +50,21 @@ def __init__(
 
         self._rng = random.Random(self.seed)
 
-        self._warning_detectors: list[base.DriftDetector] = (
-            None  # type: ignore
-            if self.warning_detector is None
-            else [self.warning_detector.clone() for _ in range(self.n_models)]
-        )
-        self._drift_detectors: list[base.DriftDetector] = (
-            None  # type: ignore
-            if self.drift_detector is None
-            else [self.drift_detector.clone() for _ in range(self.n_models)]
-        )
+        self._warning_detectors: list[base.DriftDetector]
+        self._warning_detection_disabled = True
+        if not isinstance(self.warning_detector, NoDrift):
+            self._warning_detectors = [self.warning_detector.clone() for _ in range(self.n_models)]
+            self._warning_detection_disabled = False
+
+        self._drift_detectors: list[base.DriftDetector]
+        self._drift_detection_disabled = True
+        if not isinstance(self.drift_detector, NoDrift):
+            self._drift_detectors = [self.drift_detector.clone() for _ in range(self.n_models)]
+            self._drift_detection_disabled = False
 
         # The background models
         self._background: list[BaseTreeClassifier | BaseTreeRegressor | None] = (
-            None if self.warning_detector is None else [None] * self.n_models  # type: ignore
+            None if self._warning_detection_disabled else [None] * self.n_models  # type: ignore
         )
 
         # Performance metrics used for weighted voting/aggregation
@@ -71,10 +72,10 @@ def __init__(
 
         # Drift and warning logging
         self._warning_tracker: dict = (
-            collections.defaultdict(int) if self.warning_detector is not None else None  # type: ignore
+            collections.defaultdict(int) if not self._warning_detection_disabled else None  # type: ignore
         )
         self._drift_tracker: dict = (
-            collections.defaultdict(int) if self.drift_detector is not None else None  # type: ignore
+            collections.defaultdict(int) if not self._drift_detection_disabled else None  # type: ignore
         )
 
     @property
@@ -101,12 +102,10 @@ def _drift_detector_input(
     def _new_base_model(self) -> BaseTreeClassifier | BaseTreeRegressor:
         raise NotImplementedError
 
-    def n_warnings_detected(self, tree_id: int | None = None) -> int | None:
+    def n_warnings_detected(self, tree_id: int | None = None) -> int:
         """Get the total number of concept drift warnings detected, or the number on an
         individual tree basis (optionally).
 
-        If warning detection is disabled, will return `None`.
-
         Parameters
         ----------
         tree_id
@@ -119,20 +118,18 @@ def n_warnings_detected(self, tree_id: int | None = None) -> int | None:
 
         """
 
-        if self.warning_detector is None:
-            return None
+        if self._warning_detection_disabled:
+            return 0
 
         if tree_id is None:
             return sum(self._warning_tracker.values())
 
         return self._warning_tracker[tree_id]
 
-    def n_drifts_detected(self, tree_id: int | None = None) -> int | None:
+    def n_drifts_detected(self, tree_id: int | None = None) -> int:
         """Get the total number of concept drifts detected, or such number on an individual
         tree basis (optionally).
 
-        If drift detection is disabled, will return `None`.
-
         Parameters
         ----------
         tree_id
@@ -145,8 +142,8 @@ def n_drifts_detected(self, tree_id: int | None = None) -> int | None:
 
         """
 
-        if self.drift_detector is None:
-            return None
+        if self._drift_detection_disabled:
+            return 0
 
         if tree_id is None:
             return sum(self._drift_tracker.values())
@@ -171,13 +168,13 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
 
             k = poisson(rate=self.lambda_value, rng=self._rng)
             if k > 0:
-                if self.warning_detector is not None and self._background[i] is not None:
+                if not self._warning_detection_disabled and self._background[i] is not None:
                     self._background[i].learn_one(x=x, y=y, sample_weight=k)  # type: ignore
 
                 model.learn_one(x=x, y=y, sample_weight=k)
 
                 drift_input = None
-                if self.drift_detector is not None and self.warning_detector is not None:
+                if not self._warning_detection_disabled:
                     drift_input = self._drift_detector_input(i, y, y_pred)
                     self._warning_detectors[i].update(drift_input)
 
@@ -189,7 +186,7 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
                         # Update warning tracker
                         self._warning_tracker[i] += 1
 
-                if self.drift_detector is not None:
+                if not self._drift_detection_disabled:
                     drift_input = (
                         drift_input
                         if drift_input is not None
@@ -198,7 +195,7 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
                     self._drift_detectors[i].update(drift_input)
 
                     if self._drift_detectors[i].drift_detected:
-                        if self.warning_detector is not None and self._background[i] is not None:
+                        if not self._warning_detection_disabled and self._background[i] is not None:
                             self.data[i] = self._background[i]
                             self._background[i] = None
                             self._warning_detectors[i] = self.warning_detector.clone()

From bc9a97988e127076b54e30ae0866743a948a61ea Mon Sep 17 00:00:00 2001
From: Saulo Martiello Mastelini
Date: Thu, 26 Oct 2023 17:09:59 -0300
Subject: [PATCH 2/7] lint

---
 river/drift/no_drift.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
index 3041519320..1166bd0c96 100644
--- a/river/drift/no_drift.py
+++ b/river/drift/no_drift.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from river import base
 from river.base.drift_detector import DriftDetector
 
@@ -72,4 +74,4 @@ def update(self, x: int | float) -> DriftDetector:
 
     @property
     def drift_detected(self):
-        return False
\ No newline at end of file
+        return False

From efe7bb3dfdd360e5cb49519d03a1ee0f749a10ae Mon Sep 17 00:00:00 2001
From: Saulo Martiello Mastelini
Date: Thu, 26 Oct 2023 17:21:38 -0300
Subject: [PATCH 3/7] update test

---
 river/drift/no_drift.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
index 1166bd0c96..e4f2e66fc0 100644
--- a/river/drift/no_drift.py
+++ b/river/drift/no_drift.py
@@ -20,7 +20,7 @@ class NoDrift(base.DriftDetector):
     >>> from river.datasets import synth
 
     >>> dataset = synth.ConceptDriftStream(
-    ...     seed=93,
+    ...     seed=8,
     ...     position=500,
     ...     width=40,
     ... )
@@ -37,10 +37,10 @@ class NoDrift(base.DriftDetector):
     >>> metric = metrics.Accuracy()
 
     >>> evaluate.progressive_val_score(dataset.take(700), model, metric)
-    Accuracy: 69.96%
+    Accuracy: 76.25%
 
     >>> model.n_drifts_detected()
-    1
+    2
 
     >>> model.n_warnings_detected()
     0
@@ -56,7 +56,7 @@ class NoDrift(base.DriftDetector):
     >>> metric = metrics.Accuracy()
 
     >>> evaluate.progressive_val_score(dataset.take(700), stationary_model, metric)
-    Accuracy: 71.10%
+    Accuracy: 76.25%
 
     >>> stationary_model.n_drifts_detected()
     0

From 8e4e76637e5a535ab58f8ea57cecd40f4f59d9d5 Mon Sep 17 00:00:00 2001
From: Saulo Martiello Mastelini
Date: Thu, 26 Oct 2023 18:26:04 -0300
Subject: [PATCH 4/7] Update river/drift/no_drift.py

Co-authored-by: Max Halford
---
 river/drift/no_drift.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
index e4f2e66fc0..55bcdb5bb5 100644
--- a/river/drift/no_drift.py
+++ b/river/drift/no_drift.py
@@ -6,8 +6,6 @@
 
 class NoDrift(base.DriftDetector):
     """Dummy class used to turn off concept drift detection capabilities of adaptive models.
-
-
     It always signals that no concept drift was detected.
 
 

From d95fb9bb0b82644a51129bdf69f50d8a7f5458c4 Mon Sep 17 00:00:00 2001
From: Saulo Martiello Mastelini
Date: Thu, 26 Oct 2023 18:26:11 -0300
Subject: [PATCH 5/7] Update river/drift/no_drift.py

Co-authored-by: Max Halford
---
 river/drift/no_drift.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
index 55bcdb5bb5..4091ce7cd8 100644
--- a/river/drift/no_drift.py
+++ b/river/drift/no_drift.py
@@ -11,6 +11,7 @@ class NoDrift(base.DriftDetector):
 
     Examples
     --------
+
     >>> from river import drift
     >>> from river import evaluate
     >>> from river import forest

From 56b708f6264691372b8e6065fca7e2c02052e8d8 Mon Sep 17 00:00:00 2001
From: Saulo Martiello Mastelini
Date: Thu, 26 Oct 2023 18:26:20 -0300
Subject: [PATCH 6/7] Update river/drift/no_drift.py

Co-authored-by: Max Halford
---
 river/drift/no_drift.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
index 4091ce7cd8..1762e665fc 100644
--- a/river/drift/no_drift.py
+++ b/river/drift/no_drift.py
@@ -7,8 +7,6 @@
 class NoDrift(base.DriftDetector):
     """Dummy class used to turn off concept drift detection capabilities of adaptive models.
 
     It always signals that no concept drift was detected.
-
-
     Examples
     --------
 

From 2a39c42b5879c02f400850430f1355be57e9ffbb Mon Sep 17 00:00:00 2001
From: Saulo Martiello Mastelini
Date: Thu, 26 Oct 2023 18:34:44 -0300
Subject: [PATCH 7/7] streamline test

---
 river/drift/no_drift.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/river/drift/no_drift.py b/river/drift/no_drift.py
index 1762e665fc..c5173aca04 100644
--- a/river/drift/no_drift.py
+++ b/river/drift/no_drift.py
@@ -9,7 +9,7 @@ class NoDrift(base.DriftDetector):
     It always signals that no concept drift was detected.
     Examples
     --------
-
+
     >>> from river import drift
     >>> from river import evaluate
     >>> from river import forest
@@ -20,27 +20,17 @@ class NoDrift(base.DriftDetector):
     ...     seed=8,
     ...     position=500,
     ...     width=40,
-    ... )
+    ... ).take(700)
 
     We can turn off the warning detection capabilities of Adaptive Random Forest (ARF) or
     other similar models. Thus, the base models will reset immediately after identifying a drift,
     bypassing the background model building phase:
 
-    >>> model = forest.ARFClassifier(
+    >>> adaptive_model = forest.ARFClassifier(
     ...     leaf_prediction="mc",
     ...     warning_detector=drift.NoDrift(),
     ...     seed=8
     ... )
-    >>> metric = metrics.Accuracy()
-
-    >>> evaluate.progressive_val_score(dataset.take(700), model, metric)
-    Accuracy: 76.25%
-
-    >>> model.n_drifts_detected()
-    2
-
-    >>> model.n_warnings_detected()
-    0
 
     We can also turn off the concept drift handling capabilities completely:
 
     >>> stationary_model = forest.ARFClassifier(
     ...     leaf_prediction="mc",
     ...     warning_detector=drift.NoDrift(),
     ...     drift_detector=drift.NoDrift(),
     ...     seed=8
     ... )
-    >>> metric = metrics.Accuracy()
 
-    >>> evaluate.progressive_val_score(dataset.take(700), stationary_model, metric)
-    Accuracy: 76.25%
+    Let's put that to test:
+
+    >>> for x, y in dataset:
+    ...     adaptive_model = adaptive_model.learn_one(x, y)
+    ...     stationary_model = stationary_model.learn_one(x, y)
+
+    The adaptive model:
+
+    >>> adaptive_model.n_drifts_detected()
+    2
+
+    >>> adaptive_model.n_warnings_detected()
+    0
+
+    The stationary one:
 
     >>> stationary_model.n_drifts_detected()
     0