Skip to content

Commit

Permalink
Add constituency methodology documentation (#57)
Browse files Browse the repository at this point in the history
* write data structure part of the documentation

* add constituency example part

* Edit the text and add introduction

* add math part

* edit the text of methodology

* edit titles and example code

* Fix errors in validation pages

* correct execution error

* Add check to PR action

* Increase epochs

* Don't run all epochs in test mode

* Update calibration routine parameters

* Add pct close to EFRS

* Add minimum on PE.py

---------

Co-authored-by: Nikhil Woodruff <[email protected]>
  • Loading branch information
vahid-ahmadi and nikhilwoodruff authored Dec 23, 2024
1 parent 7c782c4 commit 6296653
Show file tree
Hide file tree
Showing 15 changed files with 5,872 additions and 132 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/pull_request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ jobs:
test:
name: Build and test
runs-on: ubuntu-latest
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -47,7 +49,19 @@ jobs:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
- name: Build datasets
run: make data
env:
DATA_LITE: true
- name: Run tests
run: pytest
- name: Test documentation builds
run: make documentation

- name: Check documentation build
run: |
for notebook in $(find docs/_build/jupyter_execute -name "*.ipynb"); do
if grep -q '"output_type": "error"' "$notebook"; then
echo "Error found in $notebook"
cat "$notebook"
exit 1
fi
done
2 changes: 2 additions & 0 deletions .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ jobs:
test:
name: Build and test
runs-on: ubuntu-latest
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ test:

install:
	pip install policyengine-uk
	# Quote the requirement specifier: an unquoted `>` is shell output
	# redirection, which would install an unpinned policyengine and dump
	# pip's output into a junk file named `=2.4`.
	pip install "policyengine>=2.4"
	pip install -e ".[dev]" --config-settings editable_mode=compat

download:
Expand Down
1 change: 1 addition & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ chapters:
- file: validation/constituencies.ipynb
- file: validation/local_authorities.ipynb
- file: pension_contributions.ipynb
- file: constituency_methodology.ipynb
5,697 changes: 5,697 additions & 0 deletions docs/constituency_methodology.ipynb

Large diffs are not rendered by default.

Binary file added docs/pictures/earning_dist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/pictures/nomis_screenshot1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/pictures/parliamentary_earnings.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
148 changes: 77 additions & 71 deletions docs/validation/constituencies.ipynb

Large diffs are not rendered by default.

81 changes: 32 additions & 49 deletions docs/validation/local_authorities.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from tqdm import tqdm
import h5py
import os
from policyengine_uk_data.datasets.frs.local_areas.constituencies.transform_constituencies import (
transform_2010_to_2024,
)
Expand Down Expand Up @@ -57,6 +58,18 @@ def loss(w):

return mse_c + mse_n

def pct_close(w, t=0.1):
    """Return the fraction of targets reproduced within relative tolerance ``t``.

    Combines constituency-level targets (``metrics``/``y``) and national
    targets (``matrix_national``/``y_national``) from the enclosing scope.
    Errors are measured relative to ``1 + target`` — presumably to guard
    against zero-valued targets (TODO confirm).
    """
    # Constituency-level predictions: one estimate per (area, metric) pair.
    constituency_pred = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
    constituency_err = torch.abs(constituency_pred / (1 + y) - 1)
    constituency_hits = torch.sum(constituency_err < t)
    constituency_count = constituency_pred.shape[0] * constituency_pred.shape[1]

    # National-level predictions: weights collapsed across areas first.
    national_pred = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
    national_err = torch.abs(national_pred / (1 + y_national) - 1)
    national_hits = torch.sum(national_err < t)
    national_count = national_pred.shape[0]

    return (constituency_hits + national_hits) / (
        constituency_count + national_count
    )

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -69,16 +82,17 @@ def dropout_weights(weights, p):

optimizer = torch.optim.Adam([weights], lr=0.1)

desc = range(512)
desc = range(32) if os.environ.get("DATA_LITE") else range(256)

for epoch in desc:
optimizer.zero_grad()
weights_ = dropout_weights(weights, 0.05)
l = loss(torch.exp(weights_))
l.backward()
optimizer.step()
if epoch % 50 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}")
close = pct_close(torch.exp(weights_))
if epoch % 10 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}, Within 10%: {close:.2%}")

final_weights = torch.exp(weights).detach().numpy()
mapping_matrix = pd.read_csv(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import numpy as np
from tqdm import tqdm
import h5py
import os
from policyengine_uk_data.storage import STORAGE_FOLDER


from loss import (
from policyengine_uk_data.datasets.frs.local_areas.local_authorities.loss import (
create_local_authority_target_matrix,
create_national_target_matrix,
)
Expand Down Expand Up @@ -50,6 +51,18 @@ def loss(w):

return mse_c + mse_n

def pct_close(w, t=0.1):
    """Return the fraction of targets reproduced within relative tolerance ``t``.

    Combines local-authority targets (``metrics``/``y``) and national
    targets (``matrix_national``/``y_national``) from the enclosing scope.
    Errors are measured relative to ``1 + target`` — presumably to guard
    against zero-valued targets (TODO confirm).
    """
    # Area-level predictions: one estimate per (area, metric) pair.
    area_pred = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
    area_err = torch.abs(area_pred / (1 + y) - 1)
    area_hits = torch.sum(area_err < t)
    area_count = area_pred.shape[0] * area_pred.shape[1]

    # National-level predictions: weights collapsed across areas first.
    national_pred = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
    national_err = torch.abs(national_pred / (1 + y_national) - 1)
    national_hits = torch.sum(national_err < t)
    national_count = national_pred.shape[0]

    return (area_hits + national_hits) / (area_count + national_count)

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -62,16 +75,17 @@ def dropout_weights(weights, p):

optimizer = torch.optim.Adam([weights], lr=0.1)

desc = range(512)
desc = range(32) if os.environ.get("DATA_LITE") else range(256)

for epoch in desc:
optimizer.zero_grad()
weights_ = dropout_weights(weights, 0.05)
l = loss(torch.exp(weights_))
l.backward()
optimizer.step()
if epoch % 50 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}")
close = pct_close(torch.exp(weights_))
if epoch % 10 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}, Within 10%: {close:.2%}")

if epoch % 100 == 0:
final_weights = torch.exp(weights).detach().numpy()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,5 @@ def extract_zipped_folder(folder):
repo_filename=file.name,
local_folder=file.parent,
)
print(f"Extracting {file}")
extract_zipped_folder(file)
file.unlink()
2 changes: 1 addition & 1 deletion policyengine_uk_data/utils/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ def download(
token = os.environ.get(
"HUGGING_FACE_TOKEN",
)
login(token=token)

hf_hub_download(
repo_id=repo,
repo_type="model",
filename=repo_filename,
local_dir=local_folder,
revision=version,
token=token,
)


Expand Down
14 changes: 12 additions & 2 deletions policyengine_uk_data/utils/reweight.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import torch
import os


def reweight(
Expand Down Expand Up @@ -32,6 +33,12 @@ def loss(weights):
raise ValueError("Relative error contains NaNs")
return rel_error.mean()

def pct_close(weights, t=0.1):
    """Return the fraction of targets estimated within relative tolerance ``t``.

    Relies on ``loss_matrix`` and ``targets_array`` from the enclosing
    scope. Error is relative to ``1 + target`` — presumably a guard
    against zero-valued targets (TODO confirm).
    """
    predicted = weights @ loss_matrix
    relative_error = torch.abs((predicted - targets_array) / (1 + targets_array))
    within = relative_error < t
    return within.sum() / within.numel()

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -47,17 +54,20 @@ def dropout_weights(weights, p):

start_loss = None

iterator = range(1_000)
iterator = range(128) if os.environ.get("DATA_LITE") else range(2048)
for i in iterator:
optimizer.zero_grad()
weights_ = dropout_weights(weights, dropout_rate)
l = loss(torch.exp(weights_))
close = pct_close(torch.exp(weights_))
if start_loss is None:
start_loss = l.item()
loss_rel_change = (l.item() - start_loss) / start_loss
l.backward()
if i % 100 == 0:
print(f"Loss: {l.item()}, Rel change: {loss_rel_change}")
print(
f"Loss: {l.item()}, Rel change: {loss_rel_change}, Epoch: {i}, Within 10%: {close:.2%}"
)
optimizer.step()

return torch.exp(weights).detach().numpy()

0 comments on commit 6296653

Please sign in to comment.