Address ruff failures and unit test failure, move analyzing code to notebook
e-belfer committed Jan 7, 2025
1 parent 06ae880 commit 9ac7858
Showing 3 changed files with 220 additions and 794 deletions.
827 changes: 215 additions & 612 deletions notebooks/work-in-progress/phmsagas_distribution.ipynb

Large diffs are not rendered by default.

179 changes: 1 addition & 178 deletions src/pudl/helpers.py
@@ -2245,185 +2245,8 @@ def standardize_phone_column(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
 
         # Replace invalid or empty phone numbers with NaN
         invalid_mask = (
-            (phone_main.isna())
-            | (phone_main.str.fullmatch(r"0+") == True)
-            | (phone_main == "")
+            (phone_main.isna()) | (phone_main.str.fullmatch(r"0+")) | (phone_main == "")
         )
         df[column] = df[column].mask(invalid_mask, np.nan)
 
     return df
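
The one-line replacement above presumably addresses a ruff E712 complaint: `Series.str.fullmatch` already yields True/False (with NaN for missing entries), so comparing the result to `True` was redundant. A minimal sketch of the new mask's behavior, using made-up numbers rather than actual PHMSA phone data:

```python
import numpy as np
import pandas as pd

# Made-up example values -- not actual PHMSA phone data.
phone_main = pd.Series(["5551234567", "0000000000", "", None])

# .str.fullmatch() already yields True/False (NaN for missing entries),
# so "== True" adds nothing; isna() catches the missing entries anyway.
invalid_mask = (
    (phone_main.isna()) | (phone_main.str.fullmatch(r"0+")) | (phone_main == "")
)
print(phone_main.mask(invalid_mask, np.nan))
# 0    5551234567
# 1           NaN
# 2           NaN
# 3           NaN
# dtype: object
```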
-
-
-def analyze_missing_values(
-    df: pd.DataFrame, custom_missing_values: list[str] = None
-) -> list[str]:
-    """Analyze columns of a DataFrame for missing or invalid values.
-    PLEASE NOTE: No calls to this method should be included in any final
-    transformation scripts. This is purely for analysis and does not perform
-    any data transformation or cleaning.
-    This function checks each column for missing or custom missing values
-    and logs a summary of the findings for string (object), numeric, and
-    datetime columns.
-    Args:
-        df: The DataFrame to analyze.
-        custom_missing_values: Optional list of custom values to consider
-            as "missing" (e.g., empty strings, specific strings like "NA",
-            "NULL", etc.). If not provided, defaults to a standard set.
-    Returns:
-        exception_cols: List of names of columns that couldn't be analyzed
-            due to a caught exception.
-    """
-    nan_cols = []
-    exception_cols = []
-
-    # Use a default set of custom missing values if none are provided
-    if custom_missing_values is None:
-        custom_missing_values = [
-            "",
-            " ",
-            "NA",
-            "N/A",
-            "NULL",
-            "-",
-            "None",
-            "NaN",
-            "?",
-            "*",
-            "#",
-        ]
-
-    # Analyze columns for missing values
-    for col in df.columns:
-        try:
-            logger.info(f"Analyzing column: {col}")
-
-            # Get the column values
-            col_data = df[col]
-
-            # Check if the column is of string (object) type
-            if col_data.dtype == "object":
-                # Count rows where the value is NaN, None, empty string, or custom missing values
-                none_count = col_data.isna().sum()  # Count None (NaN)
-                empty_string_count = (
-                    col_data.str.strip() == ""
-                ).sum()  # Count empty strings
-                custom_missing_count = col_data.isin(
-                    custom_missing_values
-                ).sum()  # Count custom missing values
-
-                total_nan_count = none_count + empty_string_count + custom_missing_count
-
-                if total_nan_count > 0:
-                    nan_cols.append(col)
-
-                # Output counts
-                logger.info(f"Column '{col}' is a string type.")
-                if none_count > 0:
-                    logger.warning(f"Rows with None values: {none_count}")
-                    logger.warning(df[df[col].isna()].head())
-                if empty_string_count > 0:
-                    logger.warning(f"Rows with empty strings: {empty_string_count}")
-                    logger.warning(df[df[col].str.strip() == ""].head())
-                if custom_missing_count > 0:
-                    logger.warning(
-                        f"Rows with custom missing values: {custom_missing_count}"
-                    )
-                    logger.warning(df[df[col].isin(custom_missing_values)].head())
-                if (
-                    none_count == 0
-                    and empty_string_count == 0
-                    and custom_missing_count == 0
-                ):
-                    logger.info("Found nothing worth reporting here")
-
-            # Check if the column is numeric (int or float)
-            elif pd.api.types.is_numeric_dtype(col_data):
-                # Count NA values in the column
-                na_count = col_data.isna().sum()
-                # Count custom missing values in numeric columns (if applicable)
-                custom_missing_numeric_count = col_data.isin(
-                    [0]
-                ).sum()  # Assuming 0 is considered a missing value
-
-                if na_count > 0 or custom_missing_numeric_count > 0:
-                    nan_cols.append(col)
-
-                # Handle the non-NA data for further analysis
-                col_data_cleaned = col_data.dropna()
-
-                if not col_data_cleaned.empty:
-                    # Calculate min and max
-                    min_val = col_data_cleaned.min()
-                    max_val = col_data_cleaned.max()
-
-                    if min_val < 0 or na_count > 0 or custom_missing_numeric_count > 0:
-                        logger.warning(f"Min value: {min_val}")
-                        logger.warning(f"Max value: {max_val}")
-                    if na_count > 0:
-                        logger.warning(f"Rows with NA values: {na_count}")
-                        logger.warning(df[df[col].isna()].head())
-                    if custom_missing_numeric_count > 0:
-                        logger.warning(
-                            f"Custom missing values (e.g., 0): {custom_missing_numeric_count}"
-                        )
-                        logger.warning(df[df[col].isin([0])].head())
-                    if (
-                        min_val > 0
-                        and na_count == 0
-                        and custom_missing_numeric_count == 0
-                    ):
-                        logger.info("Found nothing worth reporting here")
-                else:
-                    logger.warning(
-                        f"Column '{col}' is numeric but contains only NA values."
-                    )
-
-            # Check if the column is a datetime type
-            elif pd.api.types.is_datetime64_any_dtype(col_data):
-                # Count NA values in the datetime column
-                na_count = col_data.isna().sum()
-                # Assuming custom missing values might be present in string form before conversion
-                custom_missing_count = col_data.isin(custom_missing_values).sum()
-
-                if na_count > 0 or custom_missing_count > 0:
-                    nan_cols.append(col)
-
-                # Handle the non-NA data for further analysis
-                col_data_cleaned = col_data.dropna()
-
-                if not col_data_cleaned.empty:
-                    # Output min and max datetime values
-                    min_date = col_data_cleaned.min()
-                    max_date = col_data_cleaned.max()
-
-                    if na_count > 0 or custom_missing_count > 0:
-                        logger.warning(f"Min date: {min_date}")
-                        logger.warning(f"Max date: {max_date}")
-                        logger.warning(f"Rows with NA values: {na_count}")
-                        logger.warning(df[df[col].isna()].head())
-                        logger.warning(f"Custom missing values: {custom_missing_count}")
-                        logger.warning(df[df[col].isin(custom_missing_values)].head())
-                    if na_count == 0 and custom_missing_count == 0:
-                        logger.info("Found nothing worth reporting here")
-                else:
-                    logger.warning(
-                        f"Column '{col}' is datetime but contains only NA values."
-                    )
-
-            # If the column is of some other type, simply note the type
-            else:
-                logger.info(f"Column '{col}' is of type {col_data.dtype}.")
-
-        except Exception as e:
-            exception_cols.append(col)
-            logger.warning(f"Caught exception for column {col}: {e}\n")
-            continue
-
-    logger.info(f"Columns with NaNs or custom missing values: {nan_cols}")
-    logger.info(f"Columns with exceptions during processing: {exception_cols}")
-
-    return exception_cols
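
Per the commit message, this analysis helper now lives in the notebook instead of `pudl.helpers`. A minimal usage sketch, assuming the function body above was moved verbatim into the notebook namespace (the example frame and its column names are invented, not from the PHMSA data):

```python
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Invented example frame covering the string, numeric, and datetime branches.
df = pd.DataFrame(
    {
        "operator_name": ["Acme Gas", "NA", ""],
        "main_miles": [12.5, 0.0, None],
        "report_date": pd.to_datetime(["2023-01-01", None, "2023-03-01"]),
    }
)

# Logs a per-column summary (note that 0 counts as missing for numeric
# columns) and returns only the columns that raised exceptions.
exception_cols = analyze_missing_values(df, custom_missing_values=["NA", ""])
```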
8 changes: 4 additions & 4 deletions test/unit/metadata_test.py
@@ -125,7 +125,7 @@ def dummy_pandera_schema():
         {
             "description": "test resource based on core_eia__entity_plants",
             "schema": {
-                "fields": ["plant_id_eia", "city", "state"],
+                "fields": ["plant_id_eia", "city", "capacity_mw"],
                 "primary_key": ["plant_id_eia"],
             },
             "sources": ["eia860", "eia923"],
@@ -146,7 +146,7 @@ def test_resource_descriptors_can_encode_schemas(dummy_pandera_schema):
         {
             "plant_id_eia": [12345, 12346],
             "city": ["Bloomington", "Springfield"],
-            "state": ["IL", "IL"],
+            "capacity_mw": [1.3, 1.0],
         }
     ).pipe(apply_pudl_dtypes)
     assert not dummy_pandera_schema.validate(good_dataframe).empty
@@ -166,7 +166,7 @@ def test_resource_descriptors_can_encode_schemas(dummy_pandera_schema):
             {
                 "plant_id_eia": ["non_number"],
                 "city": ["Bloomington"],
-                "state": ["IL"],
+                "capacity_mw": [1.3],
             }
         ).astype(str),
         id="bad dtype",
@@ -177,7 +177,7 @@ def test_resource_descriptors_can_encode_schemas(dummy_pandera_schema):
             {
                 "plant_id_eia": [12345, 12345],
                 "city": ["Bloomington", "Springfield"],
-                "state": ["IL", "IL"],
+                "capacity_mw": [1.3, 1.0],
             }
         ).pipe(apply_pudl_dtypes),
         id="duplicate PK",

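The assertion idiom in these tests relies on pandera's validation behavior: `DataFrameSchema.validate()` hands the validated frame back on success and raises `SchemaError` on failure. A standalone sketch of that pattern, using an illustrative schema rather than the fixture's real one:

```python
import pandas as pd
import pandera as pa

# Illustrative stand-in for the dummy resource schema, not PUDL's real one.
schema = pa.DataFrameSchema(
    {
        "plant_id_eia": pa.Column(int, unique=True),  # stands in for the primary key
        "city": pa.Column(str),
        "capacity_mw": pa.Column(float),
    }
)

good = pd.DataFrame(
    {
        "plant_id_eia": [12345, 12346],
        "city": ["Bloomington", "Springfield"],
        "capacity_mw": [1.3, 1.0],
    }
)
assert not schema.validate(good).empty  # validate() returns the frame on success

bad = good.assign(plant_id_eia=[12345, 12345])  # duplicate "primary key"
try:
    schema.validate(bad)
except pa.errors.SchemaError as err:
    print(err)  # reports the failed uniqueness check on plant_id_eia
```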