-
-
Notifications
You must be signed in to change notification settings - Fork 118
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3007 from catalyst-cooperative/entity_matching
Update ferc-ferc plant matching with ccai implementation.
- Loading branch information
Showing
16 changed files
with
1,600 additions
and
878 deletions.
There are no files selected for viewing
31 changes: 31 additions & 0 deletions
31
migrations/versions/2e5b623ab40b_create_intermediate_steam_plants_table_.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
"""create intermediate steam plants table with plant ids | ||
Revision ID: 2e5b623ab40b | ||
Revises: 4b08158ae952 | ||
Create Date: 2023-12-19 17:37:33.476337 | ||
""" | ||
import sqlalchemy as sa | ||
from alembic import op | ||
|
||
# revision identifiers, used by Alembic. | ||
revision = '2e5b623ab40b' | ||
down_revision = '4b08158ae952' | ||
branch_labels = None | ||
depends_on = None | ||
|
||
|
||
def upgrade() -> None: | ||
# ### commands auto generated by Alembic - please adjust! ### | ||
with op.batch_alter_table('core_ferc1__yearly_steam_plants_sched402', schema=None) as batch_op: | ||
batch_op.drop_column('plant_id_ferc1') | ||
|
||
# ### end Alembic commands ### | ||
|
||
|
||
def downgrade() -> None: | ||
# ### commands auto generated by Alembic - please adjust! ### | ||
with op.batch_alter_table('core_ferc1__yearly_steam_plants_sched402', schema=None) as batch_op: | ||
batch_op.add_column(sa.Column('plant_id_ferc1', sa.INTEGER(), nullable=True)) | ||
|
||
# ### end Alembic commands ### |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
"""Calculates useful FERC Form 1 fuel metrics on a per plant-year basis.""" | ||
|
||
import re | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def revert_filled_in_string_nulls(df: pd.DataFrame) -> pd.DataFrame: | ||
"""Revert the filled nulls from string columns.""" | ||
for col in [ | ||
"plant_type", | ||
"construction_type", | ||
"fuel_type_code_pudl", | ||
"primary_fuel_by_cost", | ||
"primary_fuel_by_mmbtu", | ||
]: | ||
if col in df.columns: | ||
# the replace to_replace={column_name: {"", pd.NA}} mysteriously doesn't work. | ||
df[col] = df[col].replace( | ||
to_replace=[""], | ||
value=pd.NA, | ||
) | ||
return df | ||
|
||
|
||
def revert_filled_in_float_nulls(df: pd.DataFrame) -> pd.DataFrame: | ||
"""Revert the filled nulls from float columns.""" | ||
float_cols = list(df.select_dtypes(include=[float])) | ||
if float_cols: | ||
df.loc[:, float_cols] = df.loc[:, float_cols].replace(0, np.nan) | ||
return df | ||
|
||
|
||
def fuel_by_plant_ferc1( | ||
fuel_df: pd.DataFrame, fuel_categories: list[str], thresh: float = 0.5 | ||
) -> pd.DataFrame: | ||
"""Calculates useful FERC Form 1 fuel metrics on a per plant-year basis. | ||
Each record in the FERC Form 1 corresponds to a particular type of fuel. Many plants | ||
-- especially coal plants -- use more than one fuel, with gas and/or diesel serving | ||
as startup fuels. In order to be able to classify the type of plant based on | ||
relative proportions of fuel consumed or fuel costs it is useful to aggregate these | ||
per-fuel records into a single record for each plant. | ||
Fuel cost (in nominal dollars) and fuel heat content (in mmBTU) are calculated for | ||
each fuel based on the cost and heat content per unit, and the number of units | ||
consumed, and then summed by fuel type (there can be more than one record for a | ||
given type of fuel in each plant because we are simplifying the fuel categories). | ||
The per-fuel records are then pivoted to create one column per fuel type. The total | ||
is summed and stored separately, and the individual fuel costs & heat contents are | ||
divided by that total, to yield fuel proportions. Based on those proportions and a | ||
minimum threshold that's passed in, a "primary" fuel type is then assigned to the | ||
plant-year record and given a string label. | ||
Args: | ||
fuel_df: Pandas DataFrame resembling the post-transform | ||
result for the fuel_ferc1 table. | ||
thresh: A value between 0.5 and 1.0 indicating the minimum fraction of | ||
overall heat content that must have been provided by a fuel in a plant-year | ||
for it to be considered the "primary" fuel for the plant in that year. | ||
Default value: 0.5. | ||
Returns: | ||
DataFrame with a single record for each plant-year, including the columns | ||
required to merge it with the plants_steam_ferc1 table/DataFrame (report_year, | ||
utility_id_ferc1, and plant_name) as well as totals for fuel mmbtu consumed in | ||
that plant-year, and the cost of fuel in that year, the proportions of heat | ||
content and fuel costs for each fuel in that year, and a column that labels the | ||
plant's primary fuel for that year. | ||
Raises: | ||
AssertionError: If the DataFrame input does not have the columns required to | ||
run the function. | ||
""" | ||
keep_cols = [ | ||
"report_year", # key | ||
"utility_id_ferc1", # key | ||
"plant_name_ferc1", # key | ||
"fuel_type_code_pudl", # pivot | ||
"fuel_consumed_units", # value | ||
"fuel_mmbtu_per_unit", # value | ||
"fuel_cost_per_unit_burned", # value | ||
] | ||
|
||
# Ensure that the dataframe we've gotten has all the information we need: | ||
missing_cols = [col for col in keep_cols if col not in fuel_df.columns] | ||
if missing_cols: | ||
raise AssertionError( | ||
f"Required columns not found in input fuel_df: {missing_cols}" | ||
) | ||
|
||
# Calculate per-fuel derived values and add them to the DataFrame | ||
df = ( | ||
# Really there should *not* be any duplicates here but... there's a | ||
# bug somewhere that introduces them into the fuel_ferc1 table. | ||
fuel_df[keep_cols] | ||
.drop_duplicates() | ||
# Calculate totals for each record based on per-unit values: | ||
.assign(fuel_mmbtu=lambda x: x.fuel_consumed_units * x.fuel_mmbtu_per_unit) | ||
.assign(fuel_cost=lambda x: x.fuel_consumed_units * x.fuel_cost_per_unit_burned) | ||
# Drop the ratios and heterogeneous fuel "units" | ||
.drop( | ||
["fuel_mmbtu_per_unit", "fuel_cost_per_unit_burned", "fuel_consumed_units"], | ||
axis=1, | ||
) | ||
# Group by the keys and fuel type, and sum: | ||
.groupby( | ||
[ | ||
"utility_id_ferc1", | ||
"plant_name_ferc1", | ||
"report_year", | ||
"fuel_type_code_pudl", | ||
], | ||
observed=True, | ||
) | ||
.sum() | ||
.reset_index() | ||
# Set the index to the keys, and pivot to get per-fuel columns: | ||
.set_index(["utility_id_ferc1", "plant_name_ferc1", "report_year"]) | ||
.pivot(columns="fuel_type_code_pudl") | ||
.fillna(0.0) | ||
) | ||
|
||
# Undo pivot. Could refactor this old function | ||
plant_year_totals = df.stack("fuel_type_code_pudl").groupby(level=[0, 1, 2]).sum() | ||
|
||
# Calculate total heat content burned for each plant, and divide it out | ||
mmbtu_group = ( | ||
pd.merge( | ||
# Sum up all the fuel heat content, and divide the individual fuel | ||
# heat contents by it (they are all contained in single higher | ||
# level group of columns labeled fuel_mmbtu) | ||
df.loc[:, "fuel_mmbtu"].div( | ||
df.loc[:, "fuel_mmbtu"].sum(axis=1), axis="rows" | ||
), | ||
# Merge that same total into the dataframe separately as well. | ||
plant_year_totals.loc[:, "fuel_mmbtu"], | ||
right_index=True, | ||
left_index=True, | ||
) | ||
.rename(columns=lambda x: re.sub(r"$", "_fraction_mmbtu", x)) | ||
.rename(columns=lambda x: re.sub(r"_mmbtu_fraction_mmbtu$", "_mmbtu", x)) | ||
) | ||
|
||
# Calculate total fuel cost for each plant, and divide it out | ||
cost_group = ( | ||
pd.merge( | ||
# Sum up all the fuel costs, and divide the individual fuel | ||
# costs by it (they are all contained in single higher | ||
# level group of columns labeled fuel_cost) | ||
df.loc[:, "fuel_cost"].div(df.loc[:, "fuel_cost"].sum(axis=1), axis="rows"), | ||
# Merge that same total into the dataframe separately as well. | ||
plant_year_totals.loc[:, "fuel_cost"], | ||
right_index=True, | ||
left_index=True, | ||
) | ||
.rename(columns=lambda x: re.sub(r"$", "_fraction_cost", x)) | ||
.rename(columns=lambda x: re.sub(r"_cost_fraction_cost$", "_cost", x)) | ||
) | ||
|
||
# Re-unify the cost and heat content information: | ||
df = pd.merge( | ||
mmbtu_group, cost_group, left_index=True, right_index=True | ||
).reset_index() | ||
|
||
# Label each plant-year record by primary fuel: | ||
df.loc[:, ["primary_fuel_by_cost", "primary_fuel_by_mmbtu"]] = pd.NA | ||
df = df.astype( | ||
{ | ||
"primary_fuel_by_cost": pd.StringDtype(), | ||
"primary_fuel_by_mmbtu": pd.StringDtype(), | ||
} | ||
) | ||
for fuel_str in fuel_categories: | ||
try: | ||
mmbtu_mask = df[f"{fuel_str}_fraction_mmbtu"] > thresh | ||
df.loc[mmbtu_mask, "primary_fuel_by_mmbtu"] = fuel_str | ||
except KeyError: | ||
pass | ||
|
||
try: | ||
cost_mask = df[f"{fuel_str}_fraction_cost"] > thresh | ||
df.loc[cost_mask, "primary_fuel_by_cost"] = fuel_str | ||
except KeyError: | ||
pass | ||
|
||
df[["primary_fuel_by_cost", "primary_fuel_by_mmbtu"]] = df[ | ||
["primary_fuel_by_cost", "primary_fuel_by_mmbtu"] | ||
].fillna("") | ||
|
||
return df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
"""This module impolements models for various forms of record linkage.""" | ||
from . import ( | ||
classify_plants_ferc1, | ||
embed_dataframe, | ||
link_cross_year, | ||
name_cleaner, | ||
) |
Oops, something went wrong.