# Reconstructed from a `git format-patch` file whose line breaks had been
# collapsed. Patch metadata, kept for provenance:
#   commit:  07e41a0b556ea08cfd44a16a5a6ea312c9efe45a
#   author:  Anderson Banihirwe
#   date:    Wed, 15 Nov 2023 21:56:53 -0800
#   subject: [PATCH] add gold-standard
#   file:    offsets_db_data/gs.py (new file, index 0000000..170cd48)
"""Gold Standard registry support: dataframe methods for credits and projects.

Every public function here is registered as a ``pandas.DataFrame`` method via
``pandas_flavor`` so that the processing pipelines can be written as method
chains.
"""

import ast

import numpy as np
import pandas as pd
import pandas_flavor as pf

from offsets_db_data.common import (
    CREDIT_SCHEMA_UPATH,
    PROJECT_SCHEMA_UPATH,
    load_column_mapping,
    load_inverted_protocol_mapping,
    load_protocol_mapping,
    load_registry_project_column_mapping,
)

# NOTE(review): the wildcard imports are presumably kept for the dataframe
# methods those modules register (``set_registry``, ``validate``, ...), which
# the pipelines below call but which are not defined in this file. Confirm
# before replacing them with explicit imports.
from offsets_db_data.credits import *  # noqa: F403
from offsets_db_data.models import credit_without_id_schema, project_schema
from offsets_db_data.projects import *  # noqa: F403

# Shared by both project-id builders so the prefix cannot drift between them.
_GS_PROJECT_ID_PREFIX = 'GS'
_GS_PROJECT_URL_BASE = 'https://registry.goldstandard.org/projects/details/'
# Maps the kind of download to the value written to ``transaction_type``.
_GS_TRANSACTION_TYPES = {'issuances': 'issuance', 'retirements': 'retirement'}


def _parse_gs_project(value):
    """Return ``value`` as a dict, parsing it as a Python literal if needed."""
    if isinstance(value, dict):
        return value
    # ``ast.literal_eval`` only evaluates literals; it does not execute code.
    return ast.literal_eval(value)


def _gs_sustaincert_id_as_str(project: dict) -> str:
    """Return the project's ``sustaincert_id`` as text (``'nan'`` if absent)."""
    return str(project.get('sustaincert_id', np.nan))


@pf.register_dataframe_method
def determine_gs_transaction_type(df: pd.DataFrame, *, download_type: str) -> pd.DataFrame:
    """Set ``transaction_type`` on every row based on the kind of download.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to annotate. It is modified in place.
    download_type : str
        Either ``'issuances'`` or ``'retirements'``.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with a ``transaction_type`` column.

    Raises
    ------
    KeyError
        If ``download_type`` is not one of the supported values.
    """
    try:
        transaction_type = _GS_TRANSACTION_TYPES[download_type]
    except KeyError:
        supported = ', '.join(repr(key) for key in _GS_TRANSACTION_TYPES)
        # Still a KeyError (as before), but one that says what went wrong.
        raise KeyError(
            f'Unsupported download_type {download_type!r}; expected one of: {supported}'
        ) from None
    df['transaction_type'] = transaction_type
    return df


@pf.register_dataframe_method
def add_gs_project_id_from_credits(df: pd.DataFrame) -> pd.DataFrame:
    """Derive ``project_id`` from the ``project`` column of a credits frame.

    Each ``project`` value is either a dict or the string form of a dict
    literal. The column is normalised to dicts, then ``project_id`` is set to
    ``'GS'`` followed by the dict's ``sustaincert_id``.

    Parameters
    ----------
    df : pd.DataFrame
        Credits frame with a ``project`` column. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with ``project`` parsed and ``project_id`` set.
    """
    df['project'] = df['project'].apply(_parse_gs_project)
    # Stringify each id on its own. Building a Series of raw ids first lets a
    # single missing id (NaN) upcast integer ids to float, so they would be
    # rendered as e.g. 'GS123.0' instead of 'GS123'.
    # NOTE(review): a project without 'sustaincert_id' still ends up with the
    # id 'GSnan'; decide whether such rows should be dropped or rejected.
    sustaincert_ids = df['project'].apply(_gs_sustaincert_id_as_str).astype(str)
    df['project_id'] = _GS_PROJECT_ID_PREFIX + sustaincert_ids
    return df


@pf.register_dataframe_method
def process_gs_credits(
    df: pd.DataFrame, *, download_type: str, registry_name: str = 'gold-standard'
) -> pd.DataFrame:
    """Turn a raw Gold Standard credits download into the validated credit schema.

    Works on a copy of ``df``. Columns are renamed using the inverse of the
    mapping loaded for ``registry_name`` / ``download_type``, then the
    registry, transaction type and project id are added. Issuance downloads
    are additionally passed through ``aggregate_issuance_transactions``.
    Finally ``transaction_date`` is converted to datetime and the frame is
    validated against ``credit_without_id_schema``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw credits data. Not modified.
    download_type : str
        ``'issuances'`` or ``'retirements'``.
    registry_name : str, default 'gold-standard'
        Registry name used to look up the column mapping and set on the rows.

    Returns
    -------
    pd.DataFrame
        Result of validating the processed frame.
    """
    df = df.copy()
    column_mapping = load_column_mapping(
        registry_name=registry_name, download_type=download_type, mapping_path=CREDIT_SCHEMA_UPATH
    )

    # The mapping is stored as {target name: source column}; rename needs the
    # opposite direction.
    columns = {v: k for k, v in column_mapping.items()}
    data = (
        df.rename(columns=columns)
        .set_registry(registry_name=registry_name)
        .determine_gs_transaction_type(download_type=download_type)
        .add_gs_project_id_from_credits()
    )

    if download_type == 'issuances':
        data = data.aggregate_issuance_transactions()

    data = data.convert_to_datetime(columns=['transaction_date']).validate(
        schema=credit_without_id_schema
    )

    return data


@pf.register_dataframe_method
def add_gs_project_url(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``project_url`` by appending ``project_id`` to the registry details URL.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``project_id`` column. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with a ``project_url`` column.
    """
    df['project_url'] = _GS_PROJECT_URL_BASE + df['project_id'].apply(str)
    return df


@pf.register_dataframe_method
def add_gs_project_id(df: pd.DataFrame) -> pd.DataFrame:
    """Prefix every ``project_id`` with ``'GS'``.

    Calling this twice on the same frame prefixes the ids twice.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``project_id`` column. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with prefixed ``project_id`` values.
    """
    df['project_id'] = df['project_id'].apply(lambda x: f'{_GS_PROJECT_ID_PREFIX}{x}')
    return df


@pf.register_dataframe_method
def process_gs_projects(
    df: pd.DataFrame, *, credits: pd.DataFrame, registry_name: str = 'gold-standard'
) -> pd.DataFrame:
    """Turn a raw Gold Standard projects download into the validated project schema.

    Works on copies of ``df`` and ``credits``. Columns are renamed using the
    inverse of the registry's project column mapping, then the frame is run
    through the chain of registered dataframe methods below and validated
    against ``project_schema``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw projects data. Not modified.
    credits : pd.DataFrame
        Credits frame passed to the steps that compute issued/retired totals
        and first issuance/retirement dates. Not modified.
    registry_name : str, default 'gold-standard'
        Registry name used to look up the column mapping and set on the rows.

    Returns
    -------
    pd.DataFrame
        Result of validating the processed frame.
    """
    df = df.copy()
    credits = credits.copy()

    registry_project_column_mapping = load_registry_project_column_mapping(
        registry_name=registry_name, file_path=PROJECT_SCHEMA_UPATH
    )
    # Stored as {target name: source column}; rename needs the opposite direction.
    inverted_column_mapping = {value: key for key, value in registry_project_column_mapping.items()}
    protocol_mapping = load_protocol_mapping()
    inverted_protocol_mapping = load_inverted_protocol_mapping()
    data = (
        df.rename(columns=inverted_column_mapping)
        .set_registry(registry_name=registry_name)
        .add_gs_project_id()
        # The URL is built after prefixing, so it contains the 'GS'-prefixed id.
        .add_gs_project_url()
        .harmonize_country_names()
        .harmonize_status_codes()
        .map_protocol(inverted_protocol_mapping=inverted_protocol_mapping)
        .add_category(protocol_mapping=protocol_mapping)
        .add_is_compliance_flag()
        .add_retired_and_issued_totals(credits=credits)
        .add_first_issuance_and_retirement_dates(credits=credits)
        .add_missing_columns(columns=project_schema.columns.keys())
        .convert_to_datetime(columns=['listed_at'])
        .validate(schema=project_schema)
    )
    return data