Add ability to analyse Regression Kink Analysis Designs #264

Merged: 29 commits, Nov 9, 2023

Commits (29)
15ef4ec
first stab at regression kink design
drbenvincent Oct 20, 2023
81ea342
first draft of kink design notebook done
drbenvincent Oct 20, 2023
e8905f3
add notebook to readthedocs
drbenvincent Oct 20, 2023
6b7e0e9
update regression kink design notebook
drbenvincent Nov 2, 2023
b4436c9
add entry into the glossary
drbenvincent Nov 2, 2023
64c0b99
update glossary entry
drbenvincent Nov 2, 2023
1aaa028
add link to glossary term in notebook
drbenvincent Nov 2, 2023
7847ede
fix typo in glossary
drbenvincent Nov 2, 2023
8ce77a1
add docstring to RegressionKink class
drbenvincent Nov 2, 2023
9cf6b51
add integration tests for regression kink design
drbenvincent Nov 2, 2023
6e47576
update UML diagram
drbenvincent Nov 2, 2023
5e099c2
update summary method to make this work + re-run example notebook
drbenvincent Nov 2, 2023
cc07b94
add comment to explain betas
drbenvincent Nov 6, 2023
dca2844
add comment about beta parameters for the quadratic example
drbenvincent Nov 6, 2023
e3c37be
remove redundant figure and re-run notebook
drbenvincent Nov 6, 2023
2f89a97
clarification about using same dataset in the basis function example
drbenvincent Nov 6, 2023
3ca5f87
show 2 decimal places in final az.plot_posterior + re-run notebook
drbenvincent Nov 6, 2023
59a8b3e
add another example to demonstrate the bandwidth parameter + add note…
drbenvincent Nov 6, 2023
0b70c9e
change title of example notebook
drbenvincent Nov 6, 2023
732707f
add regression kink design to README.md and index.rst
drbenvincent Nov 6, 2023
7e982c0
remove some commented out lines + remove parens
drbenvincent Nov 7, 2023
1ba0333
simplify the summary method
drbenvincent Nov 7, 2023
dab61dc
add input validation for bandwidth and epsilon parameters
drbenvincent Nov 7, 2023
2ad1ba3
test exceptions are raised for the new input validation
drbenvincent Nov 7, 2023
80537c6
make gradient change code more modular
drbenvincent Nov 7, 2023
b26a375
Merge branch 'main' into kink
drbenvincent Nov 7, 2023
255ea83
update UML diagrams
drbenvincent Nov 7, 2023
232944b
add tests for cp.pymc_experiments.RegressionKink._eval_gradient_change
drbenvincent Nov 7, 2023
4386f6b
remove kwarg being overwritten
drbenvincent Nov 8, 2023
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -10,6 +10,7 @@ repos:
exclude_types: [svg]
- id: check-yaml
- id: check-added-large-files
args: ['--maxkb=1500']
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
19 changes: 19 additions & 0 deletions README.md
@@ -144,6 +144,25 @@ Regression discontinuity designs are used when treatment is applied to units acc

> The data, model fit, and counterfactual are plotted (top). Frequentist analysis shows the causal impact with the blue shaded region, but this is not shown in the Bayesian analysis to avoid a cluttered chart. Instead, the Bayesian analysis shows shaded Bayesian credible regions of the model fits. The Frequentist analysis visualises the point estimate of the causal impact, but the Bayesian analysis also plots the posterior distribution of the regression discontinuity effect (bottom).

### Regression kink designs

Regression kink designs are used when treatment intensity changes according to a cutoff on a running variable, which is typically not time. Rather than looking for a discontinuity in the level of the outcome, we look for a change in its slope (a kink) at the precise point of the treatment cutoff, and use the magnitude of that gradient change to make causal claims about the potential impact of the treatment.

| Running variable | Outcome |
|-----------|-----------|
| $x_0$ | $y_0$ |
| $x_1$ | $y_1$ |
| $\ldots$ | $\ldots$ |
| $x_{N-1}$ | $y_{N-1}$ |
| $x_N$ | $y_N$ |


| Frequentist | Bayesian |
|--|--|
| coming soon | ![](docs/source/_static/regression_kink_pymc.svg) |

> The data and model fit. The Bayesian analysis shows the posterior mean with credible intervals (shaded regions). We also report the Bayesian $R^2$ on the data along with the posterior mean and credible intervals of the change in gradient at the kink point.
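The kink estimand can be sketched independently of CausalPy: fit a local straight line on each side of the kink and report the difference in slopes. The snippet below is an illustrative plain-Python sketch; the function names and simulated dataset are invented for this example and are not part of the package.

```python
import random


def simulate_kink(n=500, kink=0.5, slope_left=-1.0, slope_change=2.0,
                  noise=0.05, seed=42):
    """Simulate y = slope_left * x + slope_change * max(x - kink, 0) + noise."""
    rng = random.Random(seed)
    xs = [rng.uniform(-1.0, 1.0) for _ in range(n)]
    ys = [slope_left * x + slope_change * max(x - kink, 0.0) + rng.gauss(0.0, noise)
          for x in xs]
    return xs, ys


def ols_slope(xs, ys):
    """Closed-form OLS slope for y ~ 1 + x."""
    mx, my = sum(xs) / len(xs), sum(ys) / len(ys)
    sxy = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    sxx = sum((x - mx) ** 2 for x in xs)
    return sxy / sxx


def gradient_change_at_kink(xs, ys, kink, bandwidth=0.4):
    """Difference in slope between local line fits just above and below the kink."""
    left = [(x, y) for x, y in zip(xs, ys) if kink - bandwidth <= x < kink]
    right = [(x, y) for x, y in zip(xs, ys) if kink <= x <= kink + bandwidth]
    g_left = ols_slope([x for x, _ in left], [y for _, y in left])
    g_right = ols_slope([x for x, _ in right], [y for _, y in right])
    return g_right - g_left
```

CausalPy's `RegressionKink` replaces the two separate local fits with a single Bayesian model, which additionally yields a full posterior distribution over the gradient change rather than a point estimate.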

### Interrupted time series

Interrupted time series analysis is appropriate when you have a time series of observations which undergo treatment at a particular point in time. This kind of analysis has no control group and looks for the presence of a change in the outcome measure at or soon after the treatment time. Multiple predictors can be included.
192 changes: 192 additions & 0 deletions causalpy/pymc_experiments.py
@@ -957,6 +957,198 @@
self.print_coefficients()


class RegressionKink(ExperimentalDesign):
"""
A class to analyse sharp regression kink experiments.

:param data:
A pandas dataframe
:param formula:
A statistical model formula
:param kink_point:
A scalar threshold value at which there is a change in the first derivative of
the assignment function
:param model:
A PyMC model
:param running_variable_name:
The name of the predictor variable that the kink_point is based upon
:param epsilon:
A small scalar value which determines how far above and below the kink point to
evaluate the causal impact.
:param bandwidth:
Data outside of the bandwidth (relative to the kink point) is not used to fit
the model.
"""

> **Review comment (Contributor):** I know you suggested not having the notebook as a place for explaining the theory of kink designs, but I feel like the differences between tweaking at least one of the epsilon/bandwidth parameters could be mentioned or shown. It's fine if tweaking them isn't needed for your example, but it'd be good to hint at why they are there.
>
> **Reply (Collaborator, author):** I've added an example to demonstrate use of the bandwidth parameter. And I've added an admonition box to explain what epsilon does.

def __init__(
self,
data: pd.DataFrame,
formula: str,
kink_point: float,
model=None,
running_variable_name: str = "x",
epsilon: float = 0.001,
bandwidth: Optional[float] = None,
**kwargs,
):
super().__init__(model=model, **kwargs)
self.expt_type = "Regression Kink"
self.data = data
self.formula = formula
self.running_variable_name = running_variable_name
self.kink_point = kink_point
self.epsilon = epsilon
self.bandwidth = bandwidth
self._input_validation()

if self.bandwidth is not None:
fmin = self.kink_point - self.bandwidth
fmax = self.kink_point + self.bandwidth
filtered_data = self.data.query(f"{fmin} <= {self.running_variable_name} <= {fmax}")
if len(filtered_data) <= 10:
warnings.warn(
f"Choice of bandwidth parameter has led to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.",  # noqa: E501
UserWarning,
)
y, X = dmatrices(formula, filtered_data)
else:
y, X = dmatrices(formula, self.data)

self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]

COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
self.model.fit(X=self.X, y=self.y, coords=COORDS)

# score the goodness of fit to all data
self.score = self.model.score(X=self.X, y=self.y)

# get the model predictions of the observed data
if self.bandwidth is not None:
xi = np.linspace(fmin, fmax, 200)
else:
xi = np.linspace(
np.min(self.data[self.running_variable_name]),
np.max(self.data[self.running_variable_name]),
200,
)
self.x_pred = pd.DataFrame(
{self.running_variable_name: xi, "treated": self._is_treated(xi)}
)
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
> **Review comment (Contributor):** I'm sure there is a reason, but why is the output wrapped in braces and then immediately into an array?
>
> **Reply (Collaborator, author):** `build_design_matrices` returns both the x and y design matrices as a tuple, so we are just grabbing the x matrix here. And by default these are dataframes. So you can either provide the `return_type='matrix'` argument to `build_design_matrices`, or manually do it yourself with `np.asarray` as I did here.

self.pred = self.model.predict(X=np.asarray(new_x))

# Calculate the change in gradient by evaluating the function below the kink
# point, at the kink point, and above the kink point.
# NOTE: `"treated": np.array([0, 1, 1])` assumes treatment is applied above
# (not below) the threshold
self.x_discon = pd.DataFrame(
{
self.running_variable_name: np.array(
[
self.kink_point - self.epsilon,
self.kink_point,
self.kink_point + self.epsilon,
]
),
"treated": np.array([0, 1, 1]),
}
)
(new_x,) = build_design_matrices([self._x_design_info], self.x_discon)
self.pred_discon = self.model.predict(X=np.asarray(new_x))

self.gradient_left = (
self.pred_discon["posterior_predictive"].sel(obs_ind=1)["mu"]
- self.pred_discon["posterior_predictive"].sel(obs_ind=0)["mu"]
) / self.epsilon
self.gradient_right = (
self.pred_discon["posterior_predictive"].sel(obs_ind=2)["mu"]
- self.pred_discon["posterior_predictive"].sel(obs_ind=1)["mu"]
) / self.epsilon
self.gradient_change = self.gradient_right - self.gradient_left
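The three-point finite-difference scheme above can be restated standalone. This is a hedged plain-Python sketch of the same idea (the real implementation operates on xarray posterior draws, not scalars, and the function names here are invented for illustration):

```python
def gradient_change(mu, kink, epsilon=1e-3):
    """Finite-difference change in slope of callable `mu` at `kink`,
    mirroring the three evaluation points (kink - eps, kink, kink + eps)."""
    below, at, above = mu(kink - epsilon), mu(kink), mu(kink + epsilon)
    gradient_left = (at - below) / epsilon    # slope just below the kink
    gradient_right = (above - at) / epsilon   # slope just above the kink
    return gradient_right - gradient_left


# Piecewise-linear function with slope -1 left of 0.5 and +1 right of it,
# so the true gradient change at the kink is 2:
def piecewise(x):
    return -x + 2.0 * max(x - 0.5, 0.0)
```

A smaller `epsilon` localises the slope estimates more tightly around the kink; for a purely linear model the result is insensitive to it, but with basis functions or polynomial terms it matters.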

def _input_validation(self):
"""Validate the input data and model formula for correctness"""
if "treated" not in self.formula:
raise FormulaException(
"A predictor called `treated` should be in the formula"
)

if _is_variable_dummy_coded(self.data["treated"]) is False:
raise DataException(
"""The treated variable should be dummy coded, consisting of 0's and 1's only."""  # noqa: E501
)

def _is_treated(self, x):
"""Returns ``True`` if `x` is greater than or equal to the treatment threshold.""" # noqa: E501
return np.greater_equal(x, self.kink_point)

def plot(self):
"""
Plot the results
"""
fig, ax = plt.subplots()

# Plot raw data
sns.scatterplot(
self.data,
x=self.running_variable_name,
y=self.outcome_variable_name,
c="k",
ax=ax,
)

# Plot model fit to data
h_line, h_patch = plot_xY(
self.x_pred[self.running_variable_name],
self.pred["posterior_predictive"].mu,
ax=ax,
plot_hdi_kwargs={"color": "C1"},
)
handles = [(h_line, h_patch)]
labels = ["Posterior mean"]


# create strings to compose title
title_info = f"{self.score.r2:.3f} (std = {self.score.r2_std:.3f})"
r2 = f"Bayesian $R^2$ on all data = {title_info}"
percentiles = self.gradient_change.quantile([0.03, 1 - 0.03]).values
ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
grad_change = f"""
Change in gradient = {self.gradient_change.mean():.2f},
"""
ax.set(title=r2 + "\n" + grad_change + ci)

# Intervention line
ax.axvline(
x=self.kink_point,
ls="-",
lw=3,
color="r",
label="treatment threshold",
)
ax.legend(
handles=handles,
labels=labels,
fontsize=LEGEND_FONT_SIZE,
)
return (fig, ax)


def summary(self) -> None:
"""
Print text output summarising the results
"""

print(f"{self.expt_type:=^80}")
print(f"Formula: {self.formula}")
print(f"Running variable: {self.running_variable_name}")
print(f"Kink point on running variable: {self.kink_point}")
print("\nResults:")
print(f"Change in slope at kink point = {self.gradient_change.mean():.2f}")
self.print_coefficients()



class PrePostNEGD(ExperimentalDesign):
"""
A class to analyse data from pretest/posttest designs
84 changes: 84 additions & 0 deletions causalpy/tests/test_integration_pymc_examples.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest

@@ -6,6 +7,18 @@
sample_kwargs = {"tune": 20, "draws": 20, "chains": 2, "cores": 2}


def reg_kink_function(x, beta, kink):
"""Utility function for regression kink design. Returns a piecewise polynomial
evaluated at x, with a kink at `kink` and coefficients `beta`."""
return (
beta[0]
+ beta[1] * x
+ beta[2] * x**2
+ beta[3] * (x - kink) * (x >= kink)
+ beta[4] * (x - kink) ** 2 * (x >= kink)
)
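As a sanity check, a scalar plain-Python restatement of `reg_kink_function` (illustrative only; the helper above is vectorised over numpy arrays) makes its two defining properties easy to verify: the function is continuous at the kink, and its slope jumps by exactly `beta[3]` there.

```python
def kink_fn(x, beta, kink):
    """Scalar version of reg_kink_function: quadratic baseline plus a
    slope change (beta[3]) and curvature change (beta[4]) above the kink."""
    step = 1.0 if x >= kink else 0.0
    return (beta[0] + beta[1] * x + beta[2] * x**2
            + beta[3] * (x - kink) * step
            + beta[4] * (x - kink) ** 2 * step)


# With the same parameters the tests below use: slope -1 below the kink,
# slope -1 + 2 = +1 above it.
beta, kink, eps = [0.0, -1.0, 0.0, 2.0, 0.0], 0.5, 1e-6
left_slope = (kink_fn(kink, beta, kink) - kink_fn(kink - eps, beta, kink)) / eps
right_slope = (kink_fn(kink + eps, beta, kink) - kink_fn(kink, beta, kink)) / eps
```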


@pytest.mark.integration
def test_did():
"""
@@ -217,6 +230,77 @@ def test_rd_drinking():
assert len(result.idata.posterior.coords["draw"]) == sample_kwargs["draws"]


@pytest.mark.integration
def test_rkink():
"""
Test Regression Kink design.

Loads data and checks:
1. data is a dataframe
2. pymc_experiments.RegressionKink returns correct type
3. the correct number of MCMC chains exists in the posterior inference data
4. the correct number of MCMC draws exists in the posterior inference data
"""
# define parameters for data generation
seed = 42
rng = np.random.default_rng(seed)
N = 50
kink = 0.5
beta = [0, -1, 0, 2, 0]
sigma = 0.05
# generate data
x = rng.uniform(-1, 1, N)
y = reg_kink_function(x, beta, kink) + rng.normal(0, sigma, N)
df = pd.DataFrame({"x": x, "y": y, "treated": x >= kink})
# run experiment
result = cp.pymc_experiments.RegressionKink(
df,
formula=f"y ~ 1 + x + I((x-{kink})*treated)",
model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
kink_point=kink,
)

> **Review comment (Contributor):** Kind of neat that the kink model calls the linear model!
assert isinstance(df, pd.DataFrame)
assert isinstance(result, cp.pymc_experiments.RegressionKink)
assert len(result.idata.posterior.coords["chain"]) == sample_kwargs["chains"]
assert len(result.idata.posterior.coords["draw"]) == sample_kwargs["draws"]
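The formula `y ~ 1 + x + I((x - kink) * treated)` implies three design-matrix columns per observation. The sketch below is a hedged illustration of the row patsy would build (an invented helper, not patsy itself):

```python
def kink_design_row(x, kink):
    """Columns implied by y ~ 1 + x + I((x - kink) * treated): an intercept,
    the running variable, and a slope-change regressor that is zero below
    the kink and (x - kink) above it."""
    treated = 1.0 if x >= kink else 0.0
    return [1.0, x, (x - kink) * treated]
```

Because the third column is zero below the kink and rises linearly above it, its coefficient is directly interpretable as the change in slope at the kink point.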


> **Review comment (Contributor):** I don't think you have an example in the notebook with the bandwidth parameter? Maybe worth adding/explaining.
>
> **Reply (Collaborator, author):** Good idea.

@pytest.mark.integration
def test_rkink_bandwidth():
"""
Test Regression Kink experiment with bandwidth parameter.

Generates synthetic data and checks:
1. data is a dataframe
2. pymc_experiments.RegressionKink returns correct type
3. the correct number of MCMC chains exists in the posterior inference data
4. the correct number of MCMC draws exists in the posterior inference data
"""
# define parameters for data generation
seed = 42
rng = np.random.default_rng(seed)
N = 50
kink = 0.5
beta = [0, -1, 0, 2, 0]
sigma = 0.05
# generate data
x = rng.uniform(-1, 1, N)
y = reg_kink_function(x, beta, kink) + rng.normal(0, sigma, N)
df = pd.DataFrame({"x": x, "y": y, "treated": x >= kink})
# run experiment
result = cp.pymc_experiments.RegressionKink(
df,
formula=f"y ~ 1 + x + I((x-{kink})*treated)",
model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
kink_point=kink,
bandwidth=0.3,
)
assert isinstance(df, pd.DataFrame)
assert isinstance(result, cp.pymc_experiments.RegressionKink)
assert len(result.idata.posterior.coords["chain"]) == sample_kwargs["chains"]
assert len(result.idata.posterior.coords["draw"]) == sample_kwargs["draws"]


@pytest.mark.integration
def test_its():
"""
Binary file modified docs/source/_static/classes.png

6 changes: 3 additions & 3 deletions docs/source/_static/interrogate_badge.svg