Address ruff failures and unit test failure, move analyzing code to notebook
e-belfer committed Jan 7, 2025
1 parent 06ae880 commit 9ac7858
Showing 3 changed files with 220 additions and 794 deletions.
827 changes: 215 additions & 612 deletions notebooks/work-in-progress/phmsagas_distribution.ipynb

Large diffs are not rendered by default.

179 changes: 1 addition & 178 deletions src/pudl/helpers.py
@@ -2245,185 +2245,8 @@ def standardize_phone_column(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
 
         # Replace invalid or empty phone numbers with NaN
         invalid_mask = (
-            (phone_main.isna())
-            | (phone_main.str.fullmatch(r"0+") == True)
-            | (phone_main == "")
+            (phone_main.isna()) | (phone_main.str.fullmatch(r"0+")) | (phone_main == "")
         )
         df[column] = df[column].mask(invalid_mask, np.nan)
 
     return df
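
The one-line replacement above presumably addresses a ruff E712 complaint: `Series.str.fullmatch` already yields True/False (with NaN for missing entries), so comparing the result to `True` was redundant. A minimal sketch of the new mask's behavior, using made-up numbers rather than actual PHMSA phone data:

```python
import numpy as np
import pandas as pd

# Made-up example values -- not actual PHMSA phone data.
phone_main = pd.Series(["5551234567", "0000000000", "", None])

# .str.fullmatch() already yields True/False (NaN for missing entries),
# so "== True" adds nothing; isna() catches the missing entries anyway.
invalid_mask = (
    (phone_main.isna()) | (phone_main.str.fullmatch(r"0+")) | (phone_main == "")
)
print(phone_main.mask(invalid_mask, np.nan))
# 0    5551234567
# 1           NaN
# 2           NaN
# 3           NaN
# dtype: object
```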
-
-
-def analyze_missing_values(
-    df: pd.DataFrame, custom_missing_values: list[str] = None
-) -> list[str]:
-    """Analyze columns of a DataFrame for missing or invalid values.
-    PLEASE NOTE: No calls to this method should be included in any final
-    transformation scripts. This is purely for analysis and does not perform
-    any data transformation or cleaning.
-    This function checks each column for missing or custom missing values
-    and logs a summary of the findings for string (object), numeric, and
-    datetime columns.
-    Args:
-        df: The DataFrame to analyze.
-        custom_missing_values: Optional list of custom values to consider
-            as "missing" (e.g., empty strings, specific strings like "NA",
-            "NULL", etc.). If not provided, defaults to a standard set.
-    Returns:
-        exception_cols: List of names of columns that couldn't be analyzed
-            due to a caught exception.
-    """
-    nan_cols = []
-    exception_cols = []
-
-    # Use a default set of custom missing values if none are provided
-    if custom_missing_values is None:
-        custom_missing_values = [
-            "",
-            " ",
-            "NA",
-            "N/A",
-            "NULL",
-            "-",
-            "None",
-            "NaN",
-            "?",
-            "*",
-            "#",
-        ]
-
-    # Analyze columns for missing values
-    for col in df.columns:
-        try:
-            logger.info(f"Analyzing column: {col}")
-
-            # Get the column values
-            col_data = df[col]
-
-            # Check if the column is of string (object) type
-            if col_data.dtype == "object":
-                # Count rows where the value is NaN, None, empty string, or custom missing values
-                none_count = col_data.isna().sum()  # Count None (NaN)
-                empty_string_count = (
-                    col_data.str.strip() == ""
-                ).sum()  # Count empty strings
-                custom_missing_count = col_data.isin(
-                    custom_missing_values
-                ).sum()  # Count custom missing values
-
-                total_nan_count = none_count + empty_string_count + custom_missing_count
-
-                if total_nan_count > 0:
-                    nan_cols.append(col)
-
-                # Output counts
-                logger.info(f"Column '{col}' is a string type.")
-                if none_count > 0:
-                    logger.warning(f"Rows with None values: {none_count}")
-                    logger.warning(df[df[col].isna()].head())
-                if empty_string_count > 0:
-                    logger.warning(f"Rows with empty strings: {empty_string_count}")
-                    logger.warning(df[df[col].str.strip() == ""].head())
-                if custom_missing_count > 0:
-                    logger.warning(
-                        f"Rows with custom missing values: {custom_missing_count}"
-                    )
-                    logger.warning(df[df[col].isin(custom_missing_values)].head())
-                if (
-                    none_count == 0
-                    and empty_string_count == 0
-                    and custom_missing_count == 0
-                ):
-                    logger.info("Found nothing worth reporting here")
-
-            # Check if the column is numeric (int or float)
-            elif pd.api.types.is_numeric_dtype(col_data):
-                # Count NA values in the column
-                na_count = col_data.isna().sum()
-                # Count custom missing values in numeric columns (if applicable)
-                custom_missing_numeric_count = col_data.isin(
-                    [0]
-                ).sum()  # Assuming 0 is considered a missing value
-
-                if na_count > 0 or custom_missing_numeric_count > 0:
-                    nan_cols.append(col)
-
-                # Handle the non-NA data for further analysis
-                col_data_cleaned = col_data.dropna()
-
-                if not col_data_cleaned.empty:
-                    # Calculate min and max
-                    min_val = col_data_cleaned.min()
-                    max_val = col_data_cleaned.max()
-
-                    if min_val < 0 or na_count > 0 or custom_missing_numeric_count > 0:
-                        logger.warning(f"Min value: {min_val}")
-                        logger.warning(f"Max value: {max_val}")
-                    if na_count > 0:
-                        logger.warning(f"Rows with NA values: {na_count}")
-                        logger.warning(df[df[col].isna()].head())
-                    if custom_missing_numeric_count > 0:
-                        logger.warning(
-                            f"Custom missing values (e.g., 0): {custom_missing_numeric_count}"
-                        )
-                        logger.warning(df[df[col].isin([0])].head())
-                    if (
-                        min_val > 0
-                        and na_count == 0
-                        and custom_missing_numeric_count == 0
-                    ):
-                        logger.info("Found nothing worth reporting here")
-                else:
-                    logger.warning(
-                        f"Column '{col}' is numeric but contains only NA values."
-                    )
-
-            # Check if the column is a datetime type
-            elif pd.api.types.is_datetime64_any_dtype(col_data):
-                # Count NA values in the datetime column
-                na_count = col_data.isna().sum()
-                # Assuming custom missing values might be present in string form before conversion
-                custom_missing_count = col_data.isin(custom_missing_values).sum()
-
-                if na_count > 0 or custom_missing_count > 0:
-                    nan_cols.append(col)
-
-                # Handle the non-NA data for further analysis
-                col_data_cleaned = col_data.dropna()
-
-                if not col_data_cleaned.empty:
-                    # Output min and max datetime values
-                    min_date = col_data_cleaned.min()
-                    max_date = col_data_cleaned.max()
-
-                    if na_count > 0 or custom_missing_count > 0:
-                        logger.warning(f"Min date: {min_date}")
-                        logger.warning(f"Max date: {max_date}")
-                        logger.warning(f"Rows with NA values: {na_count}")
-                        logger.warning(df[df[col].isna()].head())
-                        logger.warning(f"Custom missing values: {custom_missing_count}")
-                        logger.warning(df[df[col].isin(custom_missing_values)].head())
-                    if na_count == 0 and custom_missing_count == 0:
-                        logger.info("Found nothing worth reporting here")
-                else:
-                    logger.warning(
-                        f"Column '{col}' is datetime but contains only NA values."
-                    )
-
-            # If the column is of some other type, simply note the type
-            else:
-                logger.info(f"Column '{col}' is of type {col_data.dtype}.")
-
-        except Exception as e:
-            exception_cols.append(col)
-            logger.warning(f"Caught exception for column {col}: {e}\n")
-            continue
-
-    logger.info(f"Columns with NaNs or custom missing values: {nan_cols}")
-    logger.info(f"Columns with exceptions during processing: {exception_cols}")
-
-    return exception_cols
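
Per the commit message, this analysis helper now lives in the notebook instead of `pudl.helpers`. A minimal usage sketch, assuming the function body above was moved verbatim into the notebook namespace (the example frame and its column names are invented, not from the PHMSA data):

```python
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Invented example frame covering the string, numeric, and datetime branches.
df = pd.DataFrame(
    {
        "operator_name": ["Acme Gas", "NA", ""],
        "main_miles": [12.5, 0.0, None],
        "report_date": pd.to_datetime(["2023-01-01", None, "2023-03-01"]),
    }
)

# Logs a per-column summary (note that 0 counts as missing for numeric
# columns) and returns only the columns that raised exceptions.
exception_cols = analyze_missing_values(df, custom_missing_values=["NA", ""])
```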
8 changes: 4 additions & 4 deletions test/unit/metadata_test.py
@@ -125,7 +125,7 @@ def dummy_pandera_schema():
         {
             "description": "test resource based on core_eia__entity_plants",
             "schema": {
-                "fields": ["plant_id_eia", "city", "state"],
+                "fields": ["plant_id_eia", "city", "capacity_mw"],
                 "primary_key": ["plant_id_eia"],
             },
             "sources": ["eia860", "eia923"],
@@ -146,7 +146,7 @@ def test_resource_descriptors_can_encode_schemas(dummy_pandera_schema):
         {
             "plant_id_eia": [12345, 12346],
             "city": ["Bloomington", "Springfield"],
-            "state": ["IL", "IL"],
+            "capacity_mw": [1.3, 1.0],
         }
     ).pipe(apply_pudl_dtypes)
     assert not dummy_pandera_schema.validate(good_dataframe).empty
@@ -166,7 +166,7 @@ def test_resource_descriptors_can_encode_schemas(dummy_pandera_schema):
             {
                 "plant_id_eia": ["non_number"],
                 "city": ["Bloomington"],
-                "state": ["IL"],
+                "capacity_mw": [1.3],
             }
         ).astype(str),
         id="bad dtype",
@@ -177,7 +177,7 @@ def test_resource_descriptors_can_encode_schemas(dummy_pandera_schema):
             {
                 "plant_id_eia": [12345, 12345],
                 "city": ["Bloomington", "Springfield"],
-                "state": ["IL", "IL"],
+                "capacity_mw": [1.3, 1.0],
             }
         ).pipe(apply_pudl_dtypes),
         id="duplicate PK",

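The assertion idiom in these tests relies on pandera's validation behavior: `DataFrameSchema.validate()` hands the validated frame back on success and raises `SchemaError` on failure. A standalone sketch of that pattern, using an illustrative schema rather than the fixture's real one:

```python
import pandas as pd
import pandera as pa

# Illustrative stand-in for the dummy resource schema, not PUDL's real one.
schema = pa.DataFrameSchema(
    {
        "plant_id_eia": pa.Column(int, unique=True),  # stands in for the primary key
        "city": pa.Column(str),
        "capacity_mw": pa.Column(float),
    }
)

good = pd.DataFrame(
    {
        "plant_id_eia": [12345, 12346],
        "city": ["Bloomington", "Springfield"],
        "capacity_mw": [1.3, 1.0],
    }
)
assert not schema.validate(good).empty  # validate() returns the frame on success

bad = good.assign(plant_id_eia=[12345, 12345])  # duplicate "primary key"
try:
    schema.validate(bad)
except pa.errors.SchemaError as err:
    print(err)  # reports the failed uniqueness check on plant_id_eia
```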