-
Notifications
You must be signed in to change notification settings - Fork 66
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add ability to analyse Regression Kink Analysis Designs #264
Changes from 20 commits
15ef4ec
81ea342
e8905f3
6b7e0e9
b4436c9
64c0b99
1aaa028
7847ede
8ce77a1
9cf6b51
6e47576
5e099c2
cc07b94
dca2844
e3c37be
2f89a97
3ca5f87
59a8b3e
0b70c9e
732707f
7e982c0
1ba0333
dab61dc
2ad1ba3
80537c6
b26a375
255ea83
232944b
4386f6b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -957,6 +957,198 @@ | |
self.print_coefficients() | ||
|
||
|
||
class RegressionKink(ExperimentalDesign): | ||
""" | ||
A class to analyse sharp regression kink experiments. | ||
|
||
:param data: | ||
A pandas dataframe | ||
:param formula: | ||
A statistical model formula | ||
:param kink_point: | ||
A scalar threshold value at which there is a change in the first derivative of | ||
the assignment function | ||
:param model: | ||
A PyMC model | ||
:param running_variable_name: | ||
The name of the predictor variable that the kink_point is based upon | ||
:param epsilon: | ||
A small scalar value which determines how far above and below the kink point to | ||
evaluate the causal impact. | ||
:param bandwidth: | ||
Data outside of the bandwidth (relative to the discontinuity) is not used to fit | ||
the model. | ||
""" | ||
|
||
def __init__( | ||
self, | ||
data: pd.DataFrame, | ||
formula: str, | ||
kink_point: float, | ||
model=None, | ||
running_variable_name: str = "x", | ||
epsilon: float = 0.001, | ||
bandwidth: Optional[float] = None, | ||
juanitorduz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
**kwargs, | ||
): | ||
super().__init__(model=model, **kwargs) | ||
self.expt_type = "Regression Kink" | ||
self.data = data | ||
self.formula = formula | ||
self.running_variable_name = running_variable_name | ||
self.kink_point = kink_point | ||
self.epsilon = epsilon | ||
self.bandwidth = bandwidth | ||
self._input_validation() | ||
|
||
if self.bandwidth is not None: | ||
fmin = self.kink_point - self.bandwidth | ||
fmax = self.kink_point + self.bandwidth | ||
filtered_data = self.data.query(f"{fmin} <= x <= {fmax}") | ||
if len(filtered_data) <= 10: | ||
warnings.warn( | ||
f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.", # noqa: E501 | ||
UserWarning, | ||
) | ||
y, X = dmatrices(formula, filtered_data) | ||
else: | ||
y, X = dmatrices(formula, self.data) | ||
|
||
self._y_design_info = y.design_info | ||
self._x_design_info = X.design_info | ||
self.labels = X.design_info.column_names | ||
self.y, self.X = np.asarray(y), np.asarray(X) | ||
self.outcome_variable_name = y.design_info.column_names[0] | ||
|
||
COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])} | ||
self.model.fit(X=self.X, y=self.y, coords=COORDS) | ||
|
||
# score the goodness of fit to all data | ||
self.score = self.model.score(X=self.X, y=self.y) | ||
|
||
# get the model predictions of the observed data | ||
if self.bandwidth is not None: | ||
xi = np.linspace(fmin, fmax, 200) | ||
else: | ||
xi = np.linspace( | ||
np.min(self.data[self.running_variable_name]), | ||
np.max(self.data[self.running_variable_name]), | ||
200, | ||
) | ||
self.x_pred = pd.DataFrame( | ||
{self.running_variable_name: xi, "treated": self._is_treated(xi)} | ||
) | ||
# self.x_pred = pd.DataFrame({self.running_variable_name: xi}) | ||
drbenvincent marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm sure there is a reason, but why is the output wrapped in braces and then immediately converted into an array? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
self.pred = self.model.predict(X=np.asarray(new_x)) | ||
|
||
# Calculate the change in gradient by evaluating the function below the kink | ||
# point, at the kink point, and above the kink point. | ||
# NOTE: `"treated": np.array([0, 1])`` assumes treatment is applied above | ||
# (not below) the threshold | ||
self.x_discon = pd.DataFrame( | ||
{ | ||
self.running_variable_name: np.array( | ||
[ | ||
self.kink_point - self.epsilon, | ||
self.kink_point, | ||
self.kink_point + self.epsilon, | ||
] | ||
), | ||
"treated": np.array([0, 1, 1]), | ||
} | ||
) | ||
(new_x,) = build_design_matrices([self._x_design_info], self.x_discon) | ||
self.pred_discon = self.model.predict(X=np.asarray(new_x)) | ||
|
||
self.gradient_left = ( | ||
self.pred_discon["posterior_predictive"].sel(obs_ind=1)["mu"] | ||
- self.pred_discon["posterior_predictive"].sel(obs_ind=0)["mu"] | ||
) / self.epsilon | ||
self.gradient_right = ( | ||
self.pred_discon["posterior_predictive"].sel(obs_ind=2)["mu"] | ||
- self.pred_discon["posterior_predictive"].sel(obs_ind=1)["mu"] | ||
) / self.epsilon | ||
self.gradient_change = self.gradient_right - self.gradient_left | ||
juanitorduz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def _input_validation(self): | ||
"""Validate the input data and model formula for correctness""" | ||
if "treated" not in self.formula: | ||
raise FormulaException( | ||
"A predictor called `treated` should be in the formula" | ||
) | ||
|
||
if _is_variable_dummy_coded(self.data["treated"]) is False: | ||
raise DataException( | ||
"""The treated variable should be dummy coded. Consisting of 0's and 1's only.""" # noqa: E501 | ||
) | ||
|
||
def _is_treated(self, x): | ||
"""Returns ``True`` if `x` is greater than or equal to the treatment threshold.""" # noqa: E501 | ||
return np.greater_equal(x, self.kink_point) | ||
|
||
def plot(self): | ||
""" | ||
Plot the results | ||
""" | ||
fig, ax = plt.subplots() | ||
# Plot raw data | ||
sns.scatterplot( | ||
self.data, | ||
x=self.running_variable_name, | ||
y=self.outcome_variable_name, | ||
c="k", # hue="treated", | ||
drbenvincent marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ax=ax, | ||
) | ||
|
||
# Plot model fit to data | ||
h_line, h_patch = plot_xY( | ||
self.x_pred[self.running_variable_name], | ||
self.pred["posterior_predictive"].mu, | ||
ax=ax, | ||
plot_hdi_kwargs={"color": "C1"}, | ||
) | ||
handles = [(h_line, h_patch)] | ||
labels = ["Posterior mean"] | ||
|
||
# create strings to compose title | ||
title_info = f"{self.score.r2:.3f} (std = {self.score.r2_std:.3f})" | ||
r2 = f"Bayesian $R^2$ on all data = {title_info}" | ||
percentiles = self.gradient_change.quantile([0.03, 1 - 0.03]).values | ||
ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]" | ||
grad_change = f""" | ||
Change in gradient = {self.gradient_change.mean():.2f}, | ||
""" | ||
ax.set(title=r2 + "\n" + grad_change + ci) | ||
# Intervention line | ||
ax.axvline( | ||
x=self.kink_point, | ||
ls="-", | ||
lw=3, | ||
color="r", | ||
label="treatment threshold", | ||
) | ||
ax.legend( | ||
handles=(h_tuple for h_tuple in handles), | ||
labels=labels, | ||
fontsize=LEGEND_FONT_SIZE, | ||
) | ||
return (fig, ax) | ||
drbenvincent marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def summary(self) -> None: | ||
""" | ||
Print text output summarising the results | ||
""" | ||
|
||
print(f"{self.expt_type:=^80}") | ||
print(f"Formula: {self.formula}") | ||
print(f"Running variable: {self.running_variable_name}") | ||
print(f"Kink point on running variable: {self.kink_point}") | ||
print("\nResults:") | ||
print(f"Change in slope at kink point = {self.gradient_change.mean():.2f}") | ||
drbenvincent marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self.print_coefficients() | ||
|
||
|
||
class PrePostNEGD(ExperimentalDesign): | ||
""" | ||
A class to analyse data from pretest/posttest designs | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
|
||
|
@@ -6,6 +7,18 @@ | |
sample_kwargs = {"tune": 20, "draws": 20, "chains": 2, "cores": 2} | ||
|
||
|
||
def reg_kink_function(x, beta, kink): | ||
"""Utility function for regression kink design. Returns a piecewise linear function | ||
evaluated at x with a kink at kink and parameters beta""" | ||
return ( | ||
beta[0] | ||
+ beta[1] * x | ||
+ beta[2] * x**2 | ||
+ beta[3] * (x - kink) * (x >= kink) | ||
+ beta[4] * (x - kink) ** 2 * (x >= kink) | ||
) | ||
|
||
|
||
@pytest.mark.integration | ||
def test_did(): | ||
""" | ||
|
@@ -217,6 +230,77 @@ def test_rd_drinking(): | |
assert len(result.idata.posterior.coords["draw"]) == sample_kwargs["draws"] | ||
|
||
|
||
@pytest.mark.integration | ||
def test_rkink(): | ||
""" | ||
Test Regression Kink design. | ||
|
||
Loads data and checks: | ||
1. data is a dataframe | ||
2. pymc_experiments.RegressionKink returns correct type | ||
3. the correct number of MCMC chains exists in the posterior inference data | ||
4. the correct number of MCMC draws exists in the posterior inference data | ||
""" | ||
# define parameters for data generation | ||
seed = 42 | ||
rng = np.random.default_rng(seed) | ||
N = 50 | ||
kink = 0.5 | ||
beta = [0, -1, 0, 2, 0] | ||
sigma = 0.05 | ||
# generate data | ||
x = rng.uniform(-1, 1, N) | ||
y = reg_kink_function(x, beta, kink) + rng.normal(0, sigma, N) | ||
df = pd.DataFrame({"x": x, "y": y, "treated": x >= kink}) | ||
# run experiment | ||
result = cp.pymc_experiments.RegressionKink( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Kind of neat that the kink model calls the linear model! |
||
df, | ||
formula=f"y ~ 1 + x + I((x-{kink})*treated)", | ||
model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs), | ||
kink_point=kink, | ||
) | ||
assert isinstance(df, pd.DataFrame) | ||
assert isinstance(result, cp.pymc_experiments.RegressionKink) | ||
assert len(result.idata.posterior.coords["chain"]) == sample_kwargs["chains"] | ||
assert len(result.idata.posterior.coords["draw"]) == sample_kwargs["draws"] | ||
|
||
|
||
@pytest.mark.integration | ||
def test_rkink_bandwidth(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think you have an example in the notebook with the bandwidth parameter? Maybe worth adding/explaining. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good idea |
||
""" | ||
Test Regression Kink experiment with bandwidth parameter. | ||
|
||
Generates synthetic data and checks: | ||
1. data is a dataframe | ||
2. pymc_experiments.RegressionKink returns correct type | ||
3. the correct number of MCMC chains exists in the posterior inference data | ||
4. the correct number of MCMC draws exists in the posterior inference data | ||
""" | ||
# define parameters for data generation | ||
seed = 42 | ||
rng = np.random.default_rng(seed) | ||
N = 50 | ||
kink = 0.5 | ||
beta = [0, -1, 0, 2, 0] | ||
sigma = 0.05 | ||
# generate data | ||
x = rng.uniform(-1, 1, N) | ||
y = reg_kink_function(x, beta, kink) + rng.normal(0, sigma, N) | ||
df = pd.DataFrame({"x": x, "y": y, "treated": x >= kink}) | ||
# run experiment | ||
result = cp.pymc_experiments.RegressionKink( | ||
df, | ||
formula=f"y ~ 1 + x + I((x-{kink})*treated)", | ||
model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs), | ||
kink_point=kink, | ||
bandwidth=0.3, | ||
) | ||
assert isinstance(df, pd.DataFrame) | ||
assert isinstance(result, cp.pymc_experiments.RegressionKink) | ||
assert len(result.idata.posterior.coords["chain"]) == sample_kwargs["chains"] | ||
assert len(result.idata.posterior.coords["draw"]) == sample_kwargs["draws"] | ||
|
||
|
||
@pytest.mark.integration | ||
def test_its(): | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know you suggested not having the notebook as a place for explaining the theory of kink designs, but I feel like the effect of tweaking at least one of the epsilon/bandwidth parameters could be mentioned or shown.
It's fine if tweaking them isn't needed for your example, but it'd be good to hint at why they are there.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added an example to demonstrate use of the
bandwidth
parameter. And I've added an admonition box to explain what epsilon does.