Skip to content

Commit

Permalink
Merge pull request #3007 from catalyst-cooperative/entity_matching
Browse files Browse the repository at this point in the history
Update ferc-ferc plant matching with ccai implementation.
  • Loading branch information
zaneselvans authored Dec 26, 2023
2 parents 0d411c3 + 962fc3d commit 5ead5b3
Show file tree
Hide file tree
Showing 16 changed files with 1,600 additions and 878 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""create intermediate steam plants table with plant ids
Revision ID: 2e5b623ab40b
Revises: 4b08158ae952
Create Date: 2023-12-19 17:37:33.476337
"""
import sqlalchemy as sa
from alembic import op

# Revision identifiers read by Alembic: this migration's ID and the ID of the
# revision it revises (see the module docstring above).
revision = "2e5b623ab40b"
down_revision = "4b08158ae952"
# No branch labels or cross-revision dependencies are declared here.
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Drop ``plant_id_ferc1`` from ``core_ferc1__yearly_steam_plants_sched402``."""
    # ### commands auto generated by Alembic - please adjust! ###
    table_name = "core_ferc1__yearly_steam_plants_sched402"
    with op.batch_alter_table(table_name, schema=None) as batch:
        batch.drop_column("plant_id_ferc1")

    # ### end Alembic commands ###


def downgrade() -> None:
    """Re-add ``plant_id_ferc1`` to ``core_ferc1__yearly_steam_plants_sched402``.

    The column is restored as a nullable integer; this function does not
    repopulate any values in it.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    table_name = "core_ferc1__yearly_steam_plants_sched402"
    restored_column = sa.Column("plant_id_ferc1", sa.INTEGER(), nullable=True)
    with op.batch_alter_table(table_name, schema=None) as batch:
        batch.add_column(restored_column)

    # ### end Alembic commands ###
3 changes: 3 additions & 0 deletions src/pudl/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
from . import (
allocate_gen_fuel,
eia_ferc1_record_linkage,
eia_ferc1_train,
epacamd_eia,
fuel_by_plant,
mcoe,
plant_parts_eia,
record_linkage,
service_territory,
spatial,
state_demand,
Expand Down
762 changes: 0 additions & 762 deletions src/pudl/analysis/classify_plants_ferc1.py

This file was deleted.

192 changes: 192 additions & 0 deletions src/pudl/analysis/fuel_by_plant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
"""Calculates useful FERC Form 1 fuel metrics on a per plant-year basis."""

import re

import numpy as np
import pandas as pd


def revert_filled_in_string_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Turn empty-string placeholders back into nulls in known string columns.

    Only the columns listed below are touched, and only when they are present
    in ``df``. The dataframe is modified in place and also returned.

    Args:
        df: Dataframe whose string columns may contain ``""`` placeholders.

    Returns:
        The same dataframe object, with ``""`` replaced by ``pd.NA`` in the
        affected columns.
    """
    string_cols = (
        "plant_type",
        "construction_type",
        "fuel_type_code_pudl",
        "primary_fuel_by_cost",
        "primary_fuel_by_mmbtu",
    )
    present_cols = [name for name in string_cols if name in df.columns]
    for name in present_cols:
        # A list-valued ``to_replace`` is used per column because the nested
        # dict form ({column_name: {"", pd.NA}}) was found not to work.
        df[name] = df[name].replace(to_replace=[""], value=pd.NA)
    return df


def revert_filled_in_float_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Turn zero placeholders back into NaN in every float column.

    The dataframe is modified in place and also returned. Columns of any
    other dtype are left alone.

    Args:
        df: Dataframe whose float columns may contain ``0`` placeholders.

    Returns:
        The same dataframe object, with zeros in float columns set to NaN.
    """
    float_cols = df.select_dtypes(include=[float]).columns.tolist()
    if not float_cols:
        # Nothing to revert; avoid assigning through an empty column selection.
        return df
    zeros_as_nan = df.loc[:, float_cols].replace(0, np.nan)
    df.loc[:, float_cols] = zeros_as_nan
    return df


def _fuel_fractions_with_total(
    pivoted: pd.DataFrame, plant_year_totals: pd.DataFrame, kind: str
) -> pd.DataFrame:
    """Build per-fuel fraction columns plus the plant-year total for one metric.

    Args:
        pivoted: Plant-year indexed dataframe with two-level columns, the
            outer level being ``fuel_mmbtu`` / ``fuel_cost`` and the inner
            level the fuel type.
        plant_year_totals: Plant-year indexed dataframe holding the summed
            ``fuel_mmbtu`` and ``fuel_cost`` columns.
        kind: Either ``"mmbtu"`` or ``"cost"``; selects the ``fuel_{kind}``
            column group and the suffix used for output column names.

    Returns:
        Dataframe with one ``{fuel}_fraction_{kind}`` column per fuel type and
        a ``fuel_{kind}`` column containing the plant-year total.
    """
    value_col = f"fuel_{kind}"
    per_fuel = pivoted.loc[:, value_col]
    return (
        pd.merge(
            # Divide each fuel's value by the row total to get proportions.
            per_fuel.div(per_fuel.sum(axis=1), axis="rows"),
            # Merge that same total into the dataframe separately as well.
            plant_year_totals.loc[:, value_col],
            left_index=True,
            right_index=True,
        )
        # Suffix every column, then strip the suffix back off of the total
        # column so that it keeps its original ``fuel_{kind}`` name.
        .rename(columns=lambda x: re.sub(r"$", f"_fraction_{kind}", x))
        .rename(
            columns=lambda x: re.sub(rf"_{kind}_fraction_{kind}$", f"_{kind}", x)
        )
    )


def fuel_by_plant_ferc1(
    fuel_df: pd.DataFrame, fuel_categories: list[str], thresh: float = 0.5
) -> pd.DataFrame:
    """Calculates useful FERC Form 1 fuel metrics on a per plant-year basis.

    Each record in the FERC Form 1 corresponds to a particular type of fuel. Many plants
    -- especially coal plants -- use more than one fuel, with gas and/or diesel serving
    as startup fuels. In order to be able to classify the type of plant based on
    relative proportions of fuel consumed or fuel costs it is useful to aggregate these
    per-fuel records into a single record for each plant.

    Fuel cost (in nominal dollars) and fuel heat content (in mmBTU) are calculated for
    each fuel based on the cost and heat content per unit, and the number of units
    consumed, and then summed by fuel type (there can be more than one record for a
    given type of fuel in each plant because we are simplifying the fuel categories).
    The per-fuel records are then pivoted to create one column per fuel type. The total
    is summed and stored separately, and the individual fuel costs & heat contents are
    divided by that total, to yield fuel proportions. Based on those proportions and a
    minimum threshold that's passed in, a "primary" fuel type is then assigned to the
    plant-year record and given a string label.

    Args:
        fuel_df: Pandas DataFrame resembling the post-transform
            result for the fuel_ferc1 table.
        fuel_categories: Fuel type labels to consider as candidate primary fuels.
            Each label is looked up as ``{label}_fraction_mmbtu`` and
            ``{label}_fraction_cost``; labels with no such column (i.e. fuels that
            never appear in ``fuel_df``) are skipped. Labels are applied in order,
            so if more than one exceeds ``thresh`` the last one wins.
        thresh: A value between 0.5 and 1.0 indicating the minimum fraction of
            overall heat content (or, for the cost-based label, overall fuel cost)
            that a fuel must strictly exceed in a plant-year for it to be considered
            the "primary" fuel for the plant in that year. Default value: 0.5.

    Returns:
        DataFrame with a single record for each plant-year, including the key columns
        (report_year, utility_id_ferc1, and plant_name_ferc1) as well as totals for
        fuel mmbtu consumed in that plant-year (``fuel_mmbtu``), and the cost of fuel
        in that year (``fuel_cost``), the proportions of heat content and fuel costs
        for each fuel in that year, and two columns (``primary_fuel_by_mmbtu`` and
        ``primary_fuel_by_cost``) that label the plant's primary fuel for that year.
        The label is an empty string when no fuel exceeds ``thresh``.

    Raises:
        AssertionError: If the DataFrame input does not have the columns required to
            run the function.
    """
    keep_cols = [
        "report_year",  # key
        "utility_id_ferc1",  # key
        "plant_name_ferc1",  # key
        "fuel_type_code_pudl",  # pivot
        "fuel_consumed_units",  # value
        "fuel_mmbtu_per_unit",  # value
        "fuel_cost_per_unit_burned",  # value
    ]

    # Ensure that the dataframe we've gotten has all the information we need:
    missing_cols = [col for col in keep_cols if col not in fuel_df.columns]
    if missing_cols:
        raise AssertionError(
            f"Required columns not found in input fuel_df: {missing_cols}"
        )

    # Calculate per-fuel derived values and add them to the DataFrame
    df = (
        # Really there should *not* be any duplicates here but... there's a
        # bug somewhere that introduces them into the fuel_ferc1 table.
        fuel_df[keep_cols]
        .drop_duplicates()
        # Calculate totals for each record based on per-unit values:
        .assign(fuel_mmbtu=lambda x: x.fuel_consumed_units * x.fuel_mmbtu_per_unit)
        .assign(fuel_cost=lambda x: x.fuel_consumed_units * x.fuel_cost_per_unit_burned)
        # Drop the ratios and heterogeneous fuel "units"
        .drop(
            ["fuel_mmbtu_per_unit", "fuel_cost_per_unit_burned", "fuel_consumed_units"],
            axis=1,
        )
        # Group by the keys and fuel type, and sum:
        .groupby(
            [
                "utility_id_ferc1",
                "plant_name_ferc1",
                "report_year",
                "fuel_type_code_pudl",
            ],
            observed=True,
        )
        .sum()
        .reset_index()
        # Set the index to the keys, and pivot to get per-fuel columns:
        .set_index(["utility_id_ferc1", "plant_name_ferc1", "report_year"])
        .pivot(columns="fuel_type_code_pudl")
        # A fuel that a plant-year never reported contributes nothing.
        .fillna(0.0)
    )

    # Undo the pivot to get the plant-year totals across all fuels.
    plant_year_totals = df.stack("fuel_type_code_pudl").groupby(level=[0, 1, 2]).sum()

    # Per-fuel proportions and overall totals, for heat content and for cost:
    mmbtu_group = _fuel_fractions_with_total(df, plant_year_totals, "mmbtu")
    cost_group = _fuel_fractions_with_total(df, plant_year_totals, "cost")

    # Re-unify the cost and heat content information:
    df = pd.merge(
        mmbtu_group, cost_group, left_index=True, right_index=True
    ).reset_index()

    # Label each plant-year record by primary fuel:
    df.loc[:, ["primary_fuel_by_cost", "primary_fuel_by_mmbtu"]] = pd.NA
    df = df.astype(
        {
            "primary_fuel_by_cost": pd.StringDtype(),
            "primary_fuel_by_mmbtu": pd.StringDtype(),
        }
    )
    for fuel_str in fuel_categories:
        for kind in ("mmbtu", "cost"):
            fraction_col = f"{fuel_str}_fraction_{kind}"
            # A category that never shows up in the input has no fraction
            # column; there is nothing to label for it, so skip it.
            if fraction_col not in df.columns:
                continue
            df.loc[df[fraction_col] > thresh, f"primary_fuel_by_{kind}"] = fuel_str

    # Plant-years with no fuel above the threshold get an empty-string label.
    df[["primary_fuel_by_cost", "primary_fuel_by_mmbtu"]] = df[
        ["primary_fuel_by_cost", "primary_fuel_by_mmbtu"]
    ].fillna("")

    return df
7 changes: 7 additions & 0 deletions src/pudl/analysis/record_linkage/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""This module implements models for various forms of record linkage."""
from . import (
classify_plants_ferc1,
embed_dataframe,
link_cross_year,
name_cleaner,
)
Loading

0 comments on commit 5ead5b3

Please sign in to comment.