"""Debiasing using reweighing"""
"""
This data recipe performs reweighing debiasing using the AIF360 package.
https://github.com/Trusted-AI/AIF360
Kamiran, F., Calders, T. Data preprocessing techniques for classification without discrimination.
Knowl Inf Syst 33, 1–33 (2012). https://doi.org/10.1007/s10115-011-0463-8
The transformer splits the original data as specified and returns training, validation, and test sets
with weights added.
1. Update the folder_path and data_file variables to indicate the location of the dataset(s).
2. validation_test_files lists additional validation or test files that need to be updated with weights.
3. validation_split indicates the percentiles at which the original data should be split to create a
validation and test set. If it's empty, no validation or test set is created. [0.7] would create
a 70/30 training/validation split. [0.7, 0.9] would create a 70/20/10 training, validation, and test split.
4. target is the name of the target column.
5. favorable_label and unfavorable_label are the socially positive and negative target value respectively.
6. protected_group_info list of lists, where each sublist contains the name of a protected column,
the unprivledged level, and the privleged level. Each of the protected columns must be binary.
7. From the Datasets section of driverless, click on ADD DATASET and then UPLOAD DATA RECIPE to upload this file.
Be sure to use the specified validation set to be used for validation when a model is trained. The weights
can cause leakage if the validation or test data is used for determining the weights.
"""
import datatable as dt
import numpy as np
import os
from h2oaicore.data import CustomData
from h2oaicore.systemutils import config
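
# For reference, a minimal sketch of the weighting scheme that aif360's Reweighing
# implements (Kamiran & Calders, 2012): each (group, label) combination receives the
# weight P(group) * P(label) / P(group, label), which makes the label statistically
# independent of the protected attribute under the weighted distribution. The helper
# below is illustrative only; it is not used by the recipe, and the pandas DataFrame
# and column names are whatever the caller passes in.
def _kamiran_calders_weights(df, group_col, label_col):
    """Return per-row reweighing weights for one protected column and the label."""
    p_group = df[group_col].value_counts(normalize=True)
    p_label = df[label_col].value_counts(normalize=True)
    p_joint = df.groupby([group_col, label_col]).size() / len(df)
    # Expected probability under independence divided by the observed joint probability
    return df.apply(
        lambda row: p_group[row[group_col]]
        * p_label[row[label_col]]
        / p_joint[(row[group_col], row[label_col])],
        axis=1,
    )
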
class MyReweightingData(CustomData):
    _modules_needed_by_name = ["datetime", "fairlearn", "aif360"]

    @staticmethod
    def create_data():
        import pandas as pd
        from h2oaicore.models_utils import import_tensorflow

        # aif360 requires tensorflow, so import it first via the Driverless AI helper
        tf = import_tensorflow()

        from aif360.datasets import BinaryLabelDataset
        from aif360.algorithms.preprocessing.reweighing import Reweighing
"""
Update the below as needed
"""
#########
#########
#########
# Path to the data
folder_path = "tmp/"
# Data file
data_file = "housing_train_proc.csv"
full_data_file = folder_path + data_file
if not os.path.isfile(full_data_file):
# for testing, just return something
if config.hard_asserts:
return dt.Frame(np.array([[1, 2, 3], [4, 5, 6]]))
else:
return []
train = pd.read_csv(full_data_file)
validation_test_files = ["housing_test_proc.csv"]
validation_split = [0.6, 0.8]
# Target column
target = "high_priced"
favorable_label = 0
unfavorable_label = 1
# Privleged_group_info = [[Protetected group name 1, prevleged level, unprivleged level], [Protetected group name 2, prevleged level, unprivleged level]]
# The protected group columns need to be binary
protected_group_info = [["hispanic", 0, 1], ["black", 0, 1]]
#########
#########
#########
        # Set up protected group info
        protected_groups = [group_info[0] for group_info in protected_group_info]

        dataset_orig = BinaryLabelDataset(
            df=train,
            label_names=[target],
            favorable_label=favorable_label,
            unfavorable_label=unfavorable_label,
            protected_attribute_names=protected_groups,
        )

        privileged_groups = []
        unprivileged_groups = []
        for protected_group in protected_group_info:
            privileged_groups_dict = {}
            unprivileged_groups_dict = {}
            privileged_groups_dict[protected_group[0]] = protected_group[1]
            unprivileged_groups_dict[protected_group[0]] = protected_group[2]
            privileged_groups.append(privileged_groups_dict)
            unprivileged_groups.append(unprivileged_groups_dict)
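        # With the defaults above this yields
        #   privileged_groups   == [{"hispanic": 0}, {"black": 0}]
        #   unprivileged_groups == [{"hispanic": 1}, {"black": 1}]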

        # Fit weights on the full dataset to be used on the external test set, if given
        RW_full = Reweighing(
            unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups
        )
        RW_full.fit(dataset_orig)
        # Split the original data into train, validation, and test if applicable
        if len(validation_split) == 1:
            dataset_orig_train, dataset_orig_valid = dataset_orig.split(
                validation_split, shuffle=True
            )
        elif len(validation_split) == 2:
            dataset_orig_train_valid, dataset_orig_test = dataset_orig.split(
                [validation_split[1]], shuffle=True
            )
            # Fit the weights on the combined train and validation data; these weights
            # are applied to the test split
            RW_train_valid = Reweighing(
                unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups,
            )
            RW_train_valid.fit(dataset_orig_train_valid)

            # Dividing by validation_split[1] rescales the split point, since
            # dataset_orig_train_valid only holds the first validation_split[1]
            # fraction of the original data
            dataset_orig_train, dataset_orig_valid = dataset_orig_train_valid.split(
                [validation_split[0] / validation_split[1]], shuffle=True
            )
        else:
            dataset_orig_train = dataset_orig

        # Fit weights on the training set only
        RW = Reweighing(
            unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups
        )
        RW.fit(dataset_orig_train)
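        # Reweighing.transform leaves features and labels unchanged; it only fills in
        # instance_weights so that the label and the protected attributes become
        # independent under the weighted distribution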
        dataset_transf_train = RW.transform(dataset_orig_train)

        # Add the weights to the training set
        train_df = pd.DataFrame(
            dataset_transf_train.features, columns=dataset_transf_train.feature_names
        )
        train_df[target] = dataset_transf_train.labels.ravel()
        train_df["weights"] = dataset_transf_train.instance_weights.ravel()

        # Collect the output datasets, keyed by the name each should be created under
        dataset_dict = {}
        dataset_dict[data_file.split(".")[0] + "_rw_train.csv"] = train_df
        # Add weights to the validation split (if a validation split was specified)
        if len(validation_split) >= 1:
            dataset_transf_valid = RW.transform(dataset_orig_valid)
            valid_df = pd.DataFrame(
                dataset_transf_valid.features,
                columns=dataset_transf_valid.feature_names,
            )
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df["weights"] = dataset_transf_valid.instance_weights.ravel()
            dataset_dict[data_file.split(".")[0] + "_rw_validation.csv"] = valid_df

        # Add weights to the test split (if a test split was specified)
        if len(validation_split) >= 2:
            dataset_transf_test = RW_train_valid.transform(dataset_orig_test)
            test_df = pd.DataFrame(
                dataset_transf_test.features, columns=dataset_transf_test.feature_names
            )
            test_df[target] = dataset_transf_test.labels.ravel()
            test_df["weights"] = dataset_transf_test.instance_weights.ravel()
            dataset_dict[data_file.split(".")[0] + "_rw_test.csv"] = test_df
        # Add weights to the external validation/test files (if provided), using the
        # weights fit on the full original dataset
        for valid_file in validation_test_files:
            valid = pd.read_csv(folder_path + valid_file)
            dataset_valid_orig = BinaryLabelDataset(
                df=valid,
                label_names=[target],
                favorable_label=favorable_label,
                unfavorable_label=unfavorable_label,
                protected_attribute_names=protected_groups,
            )
            dataset_transf_valid = RW_full.transform(dataset_valid_orig)

            valid_df = pd.DataFrame(
                dataset_transf_valid.features,
                columns=dataset_transf_valid.feature_names,
            )
            valid_df[target] = dataset_transf_valid.labels.ravel()
            valid_df["weights"] = dataset_transf_valid.instance_weights.ravel()
            dataset_dict[valid_file.split(".")[0] + "_rw_transformed.csv"] = valid_df

        return dataset_dict
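

# A minimal local smoke test, assuming an environment where h2oaicore is importable and
# the CSV files referenced above exist under tmp/ (both are assumptions; inside
# Driverless AI the recipe is uploaded through the UI instead of run directly):
if __name__ == "__main__":
    datasets = MyReweightingData.create_data()
    if isinstance(datasets, dict):
        for name, frame in datasets.items():
            print(name, frame.shape)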