# ------ Element-wise feature transformation functions
def _f_linear(x):
    """
    Linear (identity) transformation.

    Returns the values of ``x`` unchanged, as a new NumPy array.
    """
    return np.array(x)


def _f_quadratic(x):
    """
    Quadratic transformation: element-wise ``x ** 2``.
    """
    values = np.array(x)
    return values * values


def _f_cubic(x):
    """
    Cubic transformation: element-wise ``x ** 3``.
    """
    values = np.array(x)
    # Explicit products (rather than ``** 3``) keep the results identical
    # to the original implementation.
    return values * values * values


def _f_relu(x):
    """
    ReLU transformation: element-wise ``max(x, 0)``.
    """
    return np.maximum(np.array(x), 0)


def _f_sin(x):
    """
    Sine transformation: element-wise ``sin(pi * x)``.
    """
    return np.sin(np.array(x) * np.pi)


def _f_cos(x):
    """
    Cosine transformation: element-wise ``cos(pi * x)``.
    """
    return np.cos(np.array(x) * np.pi)


# ------ Generating non-linear splines as feature transformation functions
def _generate_splines(
    n_functions=10, n_initial_points=10, s=0.01,
    x_min=-3, x_max=3, y_min=0, y_max=1,
    random_seed=2019
):
    """
    Generate a list of spline functions for feature transformation.

    Each spline is fitted through ``n_initial_points`` points whose x values
    are evenly spaced on ``[x_min, x_max]`` and whose y values are drawn
    uniformly from ``[y_min, y_max)``.

    Note that this function reseeds NumPy's global random number generator
    with ``random_seed``.

    Parameters
    ----------
    n_functions : int, optional
        Number of spline functions to be created.
    n_initial_points : int, optional
        Number of initial random points to be placed on a 2D plot to fit
        a spline.
    s : float or None, optional
        Positive smoothing factor used to choose the number of knots
        (arg in scipy.interpolate.UnivariateSpline).
    x_min : int or float, optional
        The minimum value of the X range.
    x_max : int or float, optional
        The maximum value of the X range.
    y_min : int or float, optional
        The minimum value of the Y range.
    y_max : int or float, optional
        The maximum value of the Y range.
    random_seed : int, optional
        Random seed.

    Returns
    -------
    spls : list
        List of spline functions
        (scipy.interpolate.UnivariateSpline instances).
    """
    # Imported locally: the module-level imports only bring in ``fsolve``
    # from scipy, so without this the call below raises NameError.
    from scipy.interpolate import UnivariateSpline

    np.random.seed(random_seed)
    # The x grid is the same for every spline, so build it once.
    x = np.linspace(x_min, x_max, n_initial_points)
    spls = []
    for _ in range(n_functions):
        y = np.random.uniform(y_min, y_max, n_initial_points)
        spls.append(UnivariateSpline(x, y, s=s))
    return spls


def _standardize(x):
    """
    Standardize a vector to be mean 0 and std 1.

    The population standard deviation (NumPy's default, ``ddof=0``) is used.
    A vector with zero standard deviation (constant or single-element input)
    yields NaN values because of the division by zero.
    """
    values = np.array(x)
    return (values - np.mean(values)) / np.std(values)
+ """ + if f_index 0): + x_name_uplift_transformed = [] + x_name_uplift = [] + for xi in range(n_uplift_dict[treatment_key_i]): + # observed feature + x = np.random.normal(0, 1, df1.shape[0]) + x_name_i = 'x' + str(len(x_name)+1) + '_uplift' + x_name.append(x_name_i) + x_name_uplift.append(x_name_i) + df1[x_name_i] = x + # transformed feature that takes effect in the model + x_name_i = x_name_i + '_transformed' + if random_select_association: + df1[x_name_i] = _fixed_transformation(f_list, x, random.randint(0,len(f_list)-1)) + else: + df1[x_name_i] = _fixed_transformation(f_list, x, xi%len(f_list)) + x_name_uplift_transformed.append(x_name_i) + x_name_uplift_transformed_dict[treatment_key_i] = x_name_uplift_transformed + + # generate mixed informative and uplift features + for treatment_key_i in treatment_name: + if treatment_key_i in n_mix_informative_uplift_dict and n_mix_informative_uplift_dict[treatment_key_i] >0: + for xi in range(n_mix_informative_uplift_dict[treatment_key_i]): + x_name_i = 'x' + str(len(x_name)+1) + '_mix' + x_name.append(x_name_i) + p_weight = np.random.uniform(0, 1) + df1[x_name_i] = (p_weight * df1[np.random.choice(x_informative_name)] + + (1-p_weight) * df1[np.random.choice(x_name_uplift)]) + + # generate conversion probability ------------------------------------------------# + # baseline conversion + coef_classify = [] + for ci in range(n_classification_informative): + rcoef = [0] + while np.abs(rcoef) < 0.1: + rcoef = np.random.randn(1) * np.sqrt(1./n_classification_informative) + coef_classify.append(rcoef[0]) + x_classify = df1[x_informative_transformed].values + p1 = positive_class_proportion + a10 = np.log(p1/(1.-p1)) + err = np.random.normal(0, error_std, df1.shape[0]) + xb_array = (x_classify * coef_classify).sum(axis=1) + err + # solve for the constant value so that the output metric mean equal to the function input positive_class_proportion + a1 = fsolve(_softmax,a10,args=(p1,xb_array))[0] + df1['conversion_prob_linear'] = a1 
+ xb_array + df1['control_conversion_prob_linear'] = df1['conversion_prob_linear'].values + + # uplift conversion + for treatment_key_i in treatment_name: + if treatment_key_i in delta_uplift_dict and np.abs(delta_uplift_dict[treatment_key_i]) > 0.: + treatment_index = ( + df1.index[df1['treatment_group_key'] == treatment_key_i].tolist() + ) + # coefficient + coef_uplift = [] + for ci in range(n_uplift_dict[treatment_key_i]): + #rcoef = [0] + #while np.abs(rcoef) < 0.1: + # rcoef = np.random.uniform(-1, 1, 1) + rcoef = [0.5] + coef_uplift.append(rcoef[0]) + x_uplift = df1.loc[:,x_name_uplift_transformed_dict[treatment_key_i]].values + p2 = mean_dict[treatment_key_i] + a20 = np.log(p2/(1.- p2)) - a1 + xb_array = df1['conversion_prob_linear'].values + (x_uplift * coef_uplift).sum(axis=1) + xb_array_treatment = xb_array[treatment_index] + a2 = fsolve(_softmax,a20,args=(p2,xb_array_treatment))[0] + df1['%s_conversion_prob_linear'%(treatment_key_i)] = a2 + xb_array + df1.loc[treatment_index,'conversion_prob_linear'] = df1.loc[treatment_index,'%s_conversion_prob_linear'%(treatment_key_i)].values + else: + df1['%s_conversion_prob_linear'%(treatment_key_i)] = df1['conversion_prob_linear'].values + + + + + # generate conversion probability and true treatment effect ---------------------------------# + df1['conversion_prob'] = 1 / (1 + np.exp(- df1['conversion_prob_linear'].values)) + df1['control_conversion_prob'] = 1 / (1 + np.exp(- df1['control_conversion_prob_linear'].values)) + for treatment_key_i in treatment_name: + df1['%s_conversion_prob'%(treatment_key_i)] = 1 / (1 + np.exp(- df1['%s_conversion_prob_linear'%(treatment_key_i)].values)) + df1['%s_true_effect'%(treatment_key_i)] = df1['%s_conversion_prob'%(treatment_key_i)].values - df1['control_conversion_prob'].values + + # generate Y ------------------------------------------------------------# + df1['conversion_prob'] = [max(0, min(1, xi)) + for xi in df1['conversion_prob'].values] + Y1 = np.random.binomial(1, 
df1['conversion_prob'].values) + + df1[y_name] = Y1 + + return df1, x_name + + +# ------ Data generation function (V1) using make_classification from sklearn def make_uplift_classification(n_samples=1000, treatment_name=['control', 'treatment1', 'treatment2', 'treatment3'], y_name='conversion', @@ -181,3 +599,8 @@ def make_uplift_classification(n_samples=1000, df_res[y_name] = Y df_res['treatment_effect'] = Y - Y1 return df_res, x_name + + + + + diff --git a/examples/logistic_regression_based_data_generation_for_uplift_classification.ipynb b/examples/logistic_regression_based_data_generation_for_uplift_classification.ipynb new file mode 100644 index 00000000..df668dd0 --- /dev/null +++ b/examples/logistic_regression_based_data_generation_for_uplift_classification.ipynb @@ -0,0 +1,518 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Logistic Regression Based Data Generation Function for Uplift Classification Problem\n", + "This Data Generation Function uses Logistic Regression as the underlying data generation model.\n", + "This function enables better control of feature patterns: how each feature is associated with outcome baseline and treatment effect. It enables 6 different patterns: Linear, Quadratic, Cubic, Relu, Sine, and Cosine. 
\n", + "\n", + "This notebook shows how to use this data generation function to generate data, with a visualization of the feature patterns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import Data Generation Function" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The sklearn.utils.testing module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.\n" + ] + } + ], + "source": [ + "from causalml.dataset import make_uplift_classification_logistic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate Data" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "df, feature_name = make_uplift_classification_logistic( n_samples=100000,\n", + " treatment_name=['control', 'treatment1', 'treatment2', 'treatment3'],\n", + " y_name='conversion',\n", + " n_classification_features=10,\n", + " n_classification_informative=5,\n", + " n_classification_redundant=0,\n", + " n_classification_repeated=0,\n", + " n_uplift_dict={'treatment1': 2, 'treatment2': 2, 'treatment3': 3},\n", + " n_mix_informative_uplift_dict={'treatment1': 1, 'treatment2': 1, 'treatment3': 0},\n", + " delta_uplift_dict={'treatment1': 0.05, 'treatment2': 0.02, 'treatment3': -0.05},\n", + " feature_association_list = ['linear','quadratic','cubic','relu','sin','cos'],\n", + " random_select_association = False,\n", + " random_seed=20200416\n", + " \n", + " )" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
treatment_group_keyx1_informativex1_informative_transformedx2_informativex2_informative_transformedx3_informativex3_informative_transformedx4_informativex4_informative_transformedx5_informative...conversion_probcontrol_conversion_probcontrol_true_effecttreatment1_conversion_probtreatment1_true_effecttreatment2_conversion_probtreatment2_true_effecttreatment3_conversion_probtreatment3_true_effectconversion
0treatment1-0.194205-0.1920431.7914081.5726090.6780280.080696-0.169306-0.683035-1.837155...0.1267700.0761380.00.1267700.0506320.0875450.0114070.029396-0.0467420
1treatment1-0.898070-0.8944620.252125-0.663393-0.842844-0.156004-0.047769-0.683035-0.251752...0.0642780.0707990.00.064278-0.0065220.1010760.0302770.050778-0.0200210
2treatment10.7010020.7013250.239320-0.6678671.7007661.278676-0.734568-0.683035-1.130113...0.0184800.0149470.00.0184800.0035340.0180550.0031090.0193270.0043800
3control-1.653684-1.648524-0.119123-0.698492-0.037645-0.0003550.6874290.495943-1.427400...0.1027990.1027990.00.101410-0.0013900.040230-0.0625690.030753-0.0720460
4treatment31.0579091.057498-2.0195232.190564-0.950180-0.223370-1.505741-0.683035-0.399457...0.0129640.1062410.00.1713090.0650680.1145260.0082850.012964-0.0932770
\n", + "

5 rows × 47 columns

\n", + "
" + ], + "text/plain": [ + " treatment_group_key x1_informative x1_informative_transformed \\\n", + "0 treatment1 -0.194205 -0.192043 \n", + "1 treatment1 -0.898070 -0.894462 \n", + "2 treatment1 0.701002 0.701325 \n", + "3 control -1.653684 -1.648524 \n", + "4 treatment3 1.057909 1.057498 \n", + "\n", + " x2_informative x2_informative_transformed x3_informative \\\n", + "0 1.791408 1.572609 0.678028 \n", + "1 0.252125 -0.663393 -0.842844 \n", + "2 0.239320 -0.667867 1.700766 \n", + "3 -0.119123 -0.698492 -0.037645 \n", + "4 -2.019523 2.190564 -0.950180 \n", + "\n", + " x3_informative_transformed x4_informative x4_informative_transformed \\\n", + "0 0.080696 -0.169306 -0.683035 \n", + "1 -0.156004 -0.047769 -0.683035 \n", + "2 1.278676 -0.734568 -0.683035 \n", + "3 -0.000355 0.687429 0.495943 \n", + "4 -0.223370 -1.505741 -0.683035 \n", + "\n", + " x5_informative ... conversion_prob control_conversion_prob \\\n", + "0 -1.837155 ... 0.126770 0.076138 \n", + "1 -0.251752 ... 0.064278 0.070799 \n", + "2 -1.130113 ... 0.018480 0.014947 \n", + "3 -1.427400 ... 0.102799 0.102799 \n", + "4 -0.399457 ... 
0.012964 0.106241 \n", + "\n", + " control_true_effect treatment1_conversion_prob treatment1_true_effect \\\n", + "0 0.0 0.126770 0.050632 \n", + "1 0.0 0.064278 -0.006522 \n", + "2 0.0 0.018480 0.003534 \n", + "3 0.0 0.101410 -0.001390 \n", + "4 0.0 0.171309 0.065068 \n", + "\n", + " treatment2_conversion_prob treatment2_true_effect \\\n", + "0 0.087545 0.011407 \n", + "1 0.101076 0.030277 \n", + "2 0.018055 0.003109 \n", + "3 0.040230 -0.062569 \n", + "4 0.114526 0.008285 \n", + "\n", + " treatment3_conversion_prob treatment3_true_effect conversion \n", + "0 0.029396 -0.046742 0 \n", + "1 0.050778 -0.020021 0 \n", + "2 0.019327 0.004380 0 \n", + "3 0.030753 -0.072046 0 \n", + "4 0.012964 -0.093277 0 \n", + "\n", + "[5 rows x 47 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "['x1_informative', 'x2_informative', 'x3_informative', 'x4_informative', 'x5_informative', 'x6_irrelevant', 'x7_irrelevant', 'x8_irrelevant', 'x9_irrelevant', 'x10_irrelevant', 'x11_uplift', 'x12_uplift', 'x13_uplift', 'x14_uplift', 'x15_uplift', 'x16_uplift', 'x17_uplift', 'x18_mix', 'x19_mix']" + ], + "text/plain": [ + "['x1_informative',\n", + " 'x2_informative',\n", + " 'x3_informative',\n", + " 'x4_informative',\n", + " 'x5_informative',\n", + " 'x6_irrelevant',\n", + " 'x7_irrelevant',\n", + " 'x8_irrelevant',\n", + " 'x9_irrelevant',\n", + " 'x10_irrelevant',\n", + " 'x11_uplift',\n", + " 'x12_uplift',\n", + " 'x13_uplift',\n", + " 'x14_uplift',\n", + " 'x15_uplift',\n", + " 'x16_uplift',\n", + " 'x17_uplift',\n", + " 'x18_mix',\n", + " 'x19_mix']" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Experiment Group Mean" + 
] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "treatment_group_key\n", + "control 0.09896\n", + "treatment1 0.15088\n", + "treatment2 0.12042\n", + "treatment3 0.04972\n", + "Name: conversion, dtype: float64" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby(['treatment_group_key'])['conversion'].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualize Feature Pattern" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "treatment_group_key\n", + "control 0.09896\n", + "treatment1 0.15088\n", + "Name: conversion, dtype: float64" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Extract control and treatment1 for illustration\n", + "treatment_group_keys = ['control','treatment1']\n", + "y_name='conversion'\n", + "df1 = df[df['treatment_group_key'].isin(treatment_group_keys)].reset_index(drop=True)\n", + "df1.groupby(['treatment_group_key'])['conversion'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "color_dict = {'control':'#2471a3','treatment1':'#FF5733','treatment2':'#5D6D7E'\n", + " ,'treatment3':'#34495E','treatment4':'#283747'}\n", + "\n", + "hatch_dict = {'control':'','treatment1':'//'}\n", + "\n", + "x_name_plot = ['x11_uplift', 'x12_uplift', 'x2_informative', 'x5_informative']\n", + "\n", + "x_new_name_plot = ['Uplift Feature 1', 'Uplift Feature 2', 'Classification Feature 1','Classification Feature 2']\n", + "opacity = 0.8\n", + "\n", + "plt.figure(figsize=(20, 3))\n", + "subplot_list = [141,142,143,144]\n", + "counter = 0\n", + "bar_width = 0.9/len(treatment_group_keys)\n", + "for x_name_i in x_name_plot:\n", + " bins = np.percentile(df1[x_name_i].values, np.linspace(0, 100, 11))[:-1]\n", + " df1['x_bin'] = np.digitize(df1[x_name_i].values, bins)\n", + " df_gb = df1.groupby(['treatment_group_key','x_bin'],as_index=False)[y_name].mean()\n", + " plt.subplot(subplot_list[counter])\n", + " for ti in range(len(treatment_group_keys)):\n", + " x_index = [ti * bar_width - len(treatment_group_keys)/2*bar_width + xi for xi in range(10)]\n", + " plt.bar(x_index, \n", + " df_gb[df_gb['treatment_group_key']==treatment_group_keys[ti]][y_name].values, \n", + " bar_width,\n", + " alpha=opacity,\n", + " color=color_dict[treatment_group_keys[ti]],\n", + " hatch = hatch_dict[treatment_group_keys[ti]],\n", + " label=treatment_group_keys[ti]\n", + " )\n", + " plt.xticks(range(10), [int(xi+10) for xi in np.linspace(0, 100, 11)[:-1]])\n", + " plt.xlabel(x_new_name_plot[counter],fontsize=16)\n", + " plt.ylabel('Conversion',fontsize=16)\n", + " #plt.title(x_name_i)\n", + " if counter == 0:\n", + " plt.legend(treatment_group_keys, loc=2,fontsize=16)\n", + " plt.ylim([0.,0.3])\n", + " counter+=1\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the figure above, Uplift Feature 1 has a linear pattern on treatment effect, Uplift Feature 2 has a quadratic 
pattern on treatment effect, Classification Feature 1 has a quadratic pattern on baseline for both treatment and control, and Classification Feature 2 has a Sine pattern on baseline for both treatment and control." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}