Add labeled outlier dataframe output capability (#16)

* add yaml threshold input for find_outliers
* rename file for compartment specificity
Co-Authored-By: Jenna Tomkinson <[email protected]>
* add labeled outlier dataframe output capability
* add documentation and typing
* Update src/cosmicqc/analyze.py
Co-authored-by: Jenna Tomkinson <[email protected]>
* linting
* change default, add test, correct label_outliers
* remove 3.8 from tests
---------
Co-authored-by: Jenna Tomkinson <[email protected]>
WayScience · May 31, 2024 · 7447eac · 7447eac
1 parent 6f5fed8
commit 7447eac
Show file tree

Hide file tree

Showing 7 changed files with 392 additions and 15 deletions.
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -23,7 +23,7 @@ jobs:
   run_tests:
     strategy:
       matrix:
-        python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python_version: ["3.9", "3.10", "3.11", "3.12"]
         os: [ubuntu-22.04, macos-13]
     runs-on: ${{ matrix.os }}
     env:

diff --git a/pyproject.toml b/pyproject.toml
@@ -47,12 +47,16 @@ select = [
     # flake8-comprehensions
     "C4",
     # flake8-simplify
-    "SIM"
+    "SIM",
+    # flake8-annotations
+    "ANN"
 ]
 
 [tool.ruff.lint.per-file-ignores]
 # Ignore `E402` and `F401` (unused imports) in all `__init__.py` files
 "__init__.py" = ["E402", "F401"]
+# ignore typing rules for tests
+"tests/*" = ["ANN201"]
 
 # set dynamic versioning capabilities for project
 [tool.poetry-dynamic-versioning]

diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
@@ -16,12 +16,12 @@
 )
 
 
-def find_outliers(
+def identify_outliers(
     df: pd.DataFrame,
-    metadata_columns: List[str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
-) -> pd.DataFrame:
+    include_threshold_scores: bool = False,
+) -> Union[pd.Series, pd.DataFrame]:
     """
     This function uses z-scoring to format the data for detecting outlier
     nuclei or cells using specific CellProfiler features. Thresholds are
@@ -47,10 +47,21 @@ def find_outliers(
             defined within a file.
 
     Returns:
-        pd.DataFrame:
-            Outlier data frame for the given conditions.
+        Union[pd.Series, pd.DataFrame]:
+            Boolean outlier series for use within other functions, or, when
+            include_threshold_scores is True, a dataframe of z-scores plus it.
     """
 
+    # create a copy of the dataframe to ensure
+    # we don't modify the supplied dataframe inplace.
+    outlier_df = df.copy()
+
+    thresholds_name = (
+        f"outlier_{feature_thresholds}"
+        if isinstance(feature_thresholds, str)
+        else "outlier_custom"
+    )
+
     if isinstance(feature_thresholds, str):
         feature_thresholds = read_thresholds_set_from_file(
             feature_thresholds=feature_thresholds,
@@ -62,7 +73,7 @@ def find_outliers(
     for feature in feature_thresholds:
         if feature not in df.columns:
             raise ValueError(f"Feature '{feature}' does not exist in the DataFrame.")
-        df[f"Z_Score_{feature}"] = scipy_zscore(df[feature])
+        outlier_df[f"Z_Score_{feature}"] = scipy_zscore(df[feature])
         zscore_columns[feature] = f"Z_Score_{feature}"
 
     # Create outlier detection conditions for each feature
@@ -71,15 +82,77 @@ def find_outliers(
         # For positive thresholds, look for outliers that are
         # that number of std "above" the mean
         if threshold > 0:
-            condition = df[zscore_columns[feature]] > threshold
+            condition = outlier_df[zscore_columns[feature]] > threshold
         # For negative thresholds, look for outliers that are
         # that number of std "below" the mean
         else:
-            condition = df[zscore_columns[feature]] < threshold
+            condition = outlier_df[zscore_columns[feature]] < threshold
         conditions.append(condition)
 
+    return (
+        # create a boolean pd.series identifier for dataframe
+        # based on all conditions for use within other functions.
+        reduce(operator.and_, conditions)
+        if not include_threshold_scores
+        # otherwise, provide the threshold zscore col and the above column
+        else pd.concat(
+            [
+                # grab only the outlier zscore columns from the outlier_df
+                outlier_df[zscore_columns.values()],
+                pd.DataFrame({thresholds_name: reduce(operator.and_, conditions)}),
+            ],
+            axis=1,
+        )
+    )
+
+
+def find_outliers(
+    df: pd.DataFrame,
+    metadata_columns: List[str],
+    feature_thresholds: Union[Dict[str, float], str],
+    feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
+) -> pd.DataFrame:
+    """
+    This function uses identify_outliers to return a dataframe
+    with only the outliers and provided metadata columns.
+
+    Args:
+        df: pd.DataFrame
+            Data frame with converted output from CytoTable.
+        metadata_columns: List[str]
+            List of metadata columns that should be outputted with the outlier data.
+        feature_thresholds: Dict[str, float]
+            One of two options:
+            A dictionary with the feature name(s) as the key(s) and their assigned
+            threshold for identifying outliers. Positive int for the threshold
+            will detect outliers "above" the mean, negative int will detect
+            outliers "below" the mean.
+            Or a string which is a named key reference found within
+            the feature_thresholds_file yaml file.
+        feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
+            An optional feature thresholds file where thresholds may be
+            defined within a file.
+
+    Returns:
+        pd.DataFrame:
+            Outlier data frame for the given conditions.
+    """
+
+    if isinstance(feature_thresholds, str):
+        feature_thresholds = read_thresholds_set_from_file(
+            feature_thresholds=feature_thresholds,
+            feature_thresholds_file=feature_thresholds_file,
+        )
+
     # Filter DataFrame for outliers using all conditions
-    outliers_df = df[reduce(operator.and_, conditions)]
+    outliers_df = df[
+        # use identify outliers as a mask on the full dataframe
+        identify_outliers(
+            df=df,
+            feature_thresholds=feature_thresholds,
+            feature_thresholds_file=feature_thresholds_file,
+        )
+    ]
 
     # Print outliers count and range for each feature
     print("Number of outliers:", outliers_df.shape[0])
@@ -95,9 +168,95 @@ def find_outliers(
     return outliers_df[columns_to_include]
 
 
+def label_outliers(
+    df: pd.DataFrame,
+    feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
+    feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
+    include_threshold_scores: bool = False,
+) -> pd.DataFrame:
+    """
+    Use identify_outliers to label the original dataset with
+    whether each cell passed or failed the quality control condition(s).
+
+        Args:
+            df: pd.DataFrame
+                Data frame with converted output from CytoTable.
+            feature_thresholds: Optional[Union[Dict[str, float], str]] = None
+                One of three options:
+                A dictionary with the feature name(s) as the key(s) and their
+                threshold for identifying outliers: a positive threshold detects
+                outliers "above" the mean, a negative one outliers "below" it
+                (label column: outlier_custom). A string naming a threshold set
+                in the feature_thresholds_file yaml (label: outlier_<string>).
+                None (default), which applies every threshold set in that file.
+            feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
+                An optional feature thresholds file where thresholds may be
+                defined within a file.
+            include_threshold_scores: bool = False
+                Whether to include the scores in addition to whether an outlier
+                was detected or not.
+
+        Returns:
+            pd.DataFrame:
+                Input columns plus outlier boolean column(s) and optional scores.
+    """
+
+    # single rule: a named threshold set (str) or a custom mapping (dict)
+    if isinstance(feature_thresholds, (str, dict)):
+        # comes back as a Series (mask only) or a DataFrame (z-scores plus mask)
+        identified_outliers = identify_outliers(
+            df=df,
+            feature_thresholds=feature_thresholds,
+            feature_thresholds_file=feature_thresholds_file,
+            include_threshold_scores=include_threshold_scores,
+        )
+        return pd.concat(
+            [
+                df,
+                (
+                    identified_outliers
+                    if isinstance(identified_outliers, pd.DataFrame)
+                    else pd.DataFrame(
+                        {
+                            (
+                                f"outlier_{feature_thresholds}"
+                                if isinstance(feature_thresholds, str)
+                                else "outlier_custom"
+                            ): identified_outliers
+                        }
+                    )
+                ),
+            ],
+            axis=1,
+        )
+
+    # all rules (None); NOTE(review): other types match no branch -> returns None
+    elif feature_thresholds is None:
+        # concatenate the input with one result per threshold set in the file
+        labeled_df = pd.concat(
+            [df]
+            + [
+                # NOTE(review): a mask-only Series is not renamed here - confirm label
+                identify_outliers(
+                    df=df,
+                    feature_thresholds=thresholds,
+                    feature_thresholds_file=feature_thresholds_file,
+                    include_threshold_scores=include_threshold_scores,
+                )
+                # presumably iterates the rule names of the returned mapping
+                for thresholds in read_thresholds_set_from_file(
+                    feature_thresholds_file=feature_thresholds_file,
+                )
+            ],
+            axis=1,
+        )
+        # keep only the first of any columns that share a name
+        return labeled_df.loc[:, ~labeled_df.columns.duplicated()]
+
+
 def read_thresholds_set_from_file(
-    feature_thresholds: str, feature_thresholds_file: str
-):
+    feature_thresholds_file: str, feature_thresholds: Optional[str] = None
+) -> Union[Dict[str, int], Dict[str, Dict[str, int]]]:
     """
     Reads a set of feature thresholds from a specified file.
 
@@ -106,10 +265,11 @@ def read_thresholds_set_from_file(
     the thresholds set from the file.
 
     Args:
-        feature_thresholds (str):
-            A string specifying the feature thresholds.
         feature_thresholds_file (str):
             The path to the file containing feature thresholds.
+        feature_thresholds (Optional str, default None):
+            A string specifying the feature thresholds.
+            If we have None, return all thresholds.
 
     Returns:
         dict: A dictionary containing the processed feature thresholds.
@@ -118,9 +278,14 @@ def read_thresholds_set_from_file(
         LookupError: If the file does not contain the specified feature_thresholds key.
     """
 
+    # open the yaml file
     with open(feature_thresholds_file, "r") as file:
         thresholds = yaml.safe_load(file)
 
+    # if no feature thresholds name is specified, return all thresholds
+    if feature_thresholds is None:
+        return thresholds["thresholds"]
+
     if feature_thresholds not in thresholds["thresholds"]:
         raise LookupError(
             (

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -16,3 +16,11 @@ def fixture_cytotable_CFReT_df():
     return pd.read_parquet(
         "tests/data/cytotable/CFRet_data/test_localhost231120090001_converted.parquet"
     )
+
+
+@pytest.fixture(name="basic_outlier_dataframe")
+def fixture_basic_outlier_dataframe():
+    """
+    Creates a one-column dataframe (example_feature, integers 1-10) for tests.
+    """
+    return pd.DataFrame({"example_feature": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
diff --git a/tests/data/coSMicQC/test_identifier_outliers_output.parquet b/tests/data/coSMicQC/test_identifier_outliers_output.parquet
diff --git a/tests/data/coSMicQC/test_label_outliers_output.parquet b/tests/data/coSMicQC/test_label_outliers_output.parquet