Skip to content

Commit

Permalink
Add labeled outlier dataframe output capability (#16)
Browse files Browse the repository at this point in the history
* add yaml threshold input for find_outliers

* rename file for compartment specificity

Co-Authored-By: Jenna Tomkinson <[email protected]>

* add labeled outlier dataframe output capability

* add documentation and typing

* Update src/cosmicqc/analyze.py

Co-authored-by: Jenna Tomkinson <[email protected]>

* linting

* change default, add test, correct label_outliers

* remove 3.8 from tests

---------

Co-authored-by: Jenna Tomkinson <[email protected]>
  • Loading branch information
d33bs and jenna-tomkinson authored May 31, 2024
1 parent 6f5fed8 commit 7447eac
Show file tree
Hide file tree
Showing 7 changed files with 392 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
run_tests:
strategy:
matrix:
python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python_version: ["3.9", "3.10", "3.11", "3.12"]
os: [ubuntu-22.04, macos-13]
runs-on: ${{ matrix.os }}
env:
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,16 @@ select = [
# flake8-comprehensions
"C4",
# flake8-simplify
"SIM"
"SIM",
# flake8-annotations
"ANN"
]

[tool.ruff.lint.per-file-ignores]
# Ignore `E402` and `F401` (unused imports) in all `__init__.py` files
"__init__.py" = ["E402", "F401"]
# ignore missing return type annotations on public functions (`ANN201`) in tests
"tests/*" = ["ANN201"]

# set dynamic versioning capabilities for project
[tool.poetry-dynamic-versioning]
Expand Down
191 changes: 178 additions & 13 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
)


def find_outliers(
def identify_outliers(
df: pd.DataFrame,
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
) -> pd.DataFrame:
include_threshold_scores: bool = False,
) -> Union[pd.Series, pd.DataFrame]:
"""
This function uses z-scoring to format the data for detecting outlier
nuclei or cells using specific CellProfiler features. Thresholds are
Expand All @@ -47,10 +47,21 @@ def find_outliers(
defined within a file.
Returns:
pd.DataFrame:
Outlier data frame for the given conditions.
Union[pd.Series, pd.DataFrame]:
Outlier series with booleans based on whether outliers were detected
or not for use within other functions.
"""

# create a copy of the dataframe to ensure
# we don't modify the supplied dataframe inplace.
outlier_df = df.copy()

thresholds_name = (
f"outlier_{feature_thresholds}"
if isinstance(feature_thresholds, str)
else "outlier_custom"
)

if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
Expand All @@ -62,7 +73,7 @@ def find_outliers(
for feature in feature_thresholds:
if feature not in df.columns:
raise ValueError(f"Feature '{feature}' does not exist in the DataFrame.")
df[f"Z_Score_{feature}"] = scipy_zscore(df[feature])
outlier_df[f"Z_Score_{feature}"] = scipy_zscore(df[feature])
zscore_columns[feature] = f"Z_Score_{feature}"

# Create outlier detection conditions for each feature
Expand All @@ -71,15 +82,77 @@ def find_outliers(
# For positive thresholds, look for outliers that are
# that number of std "above" the mean
if threshold > 0:
condition = df[zscore_columns[feature]] > threshold
condition = outlier_df[zscore_columns[feature]] > threshold
# For negative thresholds, look for outliers that are
# that number of std "below" the mean
else:
condition = df[zscore_columns[feature]] < threshold
condition = outlier_df[zscore_columns[feature]] < threshold
conditions.append(condition)

return (
# create a boolean pd.series identifier for dataframe
# based on all conditions for use within other functions.
reduce(operator.and_, conditions)
if not include_threshold_scores
# otherwise, provide the threshold zscore col and the above column
else pd.concat(
[
# grab only the outlier zscore columns from the outlier_df
outlier_df[zscore_columns.values()],
pd.DataFrame({thresholds_name: reduce(operator.and_, conditions)}),
],
axis=1,
)
)


def find_outliers(
df: pd.DataFrame,
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
) -> pd.DataFrame:
"""
This function uses identify_outliers to return a dataframe
with only the outliers and provided metadata columns.
Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
One of two options:
A dictionary with the feature name(s) as the key(s) and their assigned
threshold for identifying outliers. Positive int for the threshold
will detect outliers "above" than the mean, negative int will detect
outliers "below" the mean.
Or a string which is a named key reference found within
the feature_thresholds_file yaml file.
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.
Returns:
pd.DataFrame:
Outlier data frame for the given conditions.
"""

if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)

# Filter DataFrame for outliers using all conditions
outliers_df = df[reduce(operator.and_, conditions)]
outliers_df = df[
# use identify outliers as a mask on the full dataframe
identify_outliers(
df=df,
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)
]

# Print outliers count and range for each feature
print("Number of outliers:", outliers_df.shape[0])
Expand All @@ -95,9 +168,95 @@ def find_outliers(
return outliers_df[columns_to_include]


def label_outliers(
    df: pd.DataFrame,
    feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
    feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
    include_threshold_scores: bool = False,
) -> pd.DataFrame:
    """
    Use identify_outliers to label the original dataset for
    where a cell passed or failed the quality control condition(s).

    Args:
        df: pd.DataFrame
            Data frame with converted output from CytoTable.
        feature_thresholds: Optional[Union[Dict[str, float], str]] = None
            One of three options:
            A dictionary with the feature name(s) as the key(s) and their
            assigned threshold for identifying outliers. A positive threshold
            will detect outliers "above" the mean, a negative threshold will
            detect outliers "below" the mean.
            Or a string which is a named key reference found within
            the feature_thresholds_file yaml file.
            Or None, in which case every threshold set found within
            feature_thresholds_file is applied.
        feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
            An optional feature thresholds file where thresholds may be
            defined within a file.
        include_threshold_scores: bool = False
            Whether to include the scores in addition to whether an outlier
            was detected or not.

    Returns:
        pd.DataFrame:
            Full dataframe with optional scores and one boolean
            "outlier_<name>" column per threshold rule ("outlier_custom"
            when a dictionary of thresholds is provided).

    Raises:
        TypeError:
            If feature_thresholds is not a str, a dict or None.
    """

    def _labeled_outliers(
        thresholds: Union[Dict[str, float], str],
    ) -> pd.DataFrame:
        """Run identify_outliers for one rule and return a labeled dataframe."""
        identified_outliers = identify_outliers(
            df=df,
            feature_thresholds=thresholds,
            feature_thresholds_file=feature_thresholds_file,
            include_threshold_scores=include_threshold_scores,
        )

        # with scores requested we already receive a labeled dataframe
        if isinstance(identified_outliers, pd.DataFrame):
            return identified_outliers

        # otherwise we receive a bare boolean series which needs an explicit
        # column label so results from different rules stay distinguishable.
        return pd.DataFrame(
            {
                (
                    f"outlier_{thresholds}"
                    if isinstance(thresholds, str)
                    else "outlier_custom"
                ): identified_outliers
            }
        )

    # for single outlier processing
    if isinstance(feature_thresholds, (str, dict)):
        # return the original data alongside the result for one threshold rule
        return pd.concat([df, _labeled_outliers(feature_thresholds)], axis=1)

    # for multiple outlier processing
    if feature_thresholds is None:
        # label outliers for each named threshold rule found in the file
        labeled_df = pd.concat(
            [df]
            + [
                _labeled_outliers(thresholds_name)
                for thresholds_name in read_thresholds_set_from_file(
                    feature_thresholds_file=feature_thresholds_file,
                )
            ],
            axis=1,
        )
        # threshold rules which share a feature produce the same
        # "Z_Score_<feature>" column, so keep only the first of each name.
        return labeled_df.loc[:, ~labeled_df.columns.duplicated()]

    # avoid silently returning None for unsupported input
    raise TypeError(
        "feature_thresholds must be a str, a dict or None, "
        f"not {type(feature_thresholds).__name__}."
    )


def read_thresholds_set_from_file(
feature_thresholds: str, feature_thresholds_file: str
):
feature_thresholds_file: str, feature_thresholds: Optional[str] = None
) -> Union[Dict[str, int], Dict[str, Dict[str, int]]]:
"""
Reads a set of feature thresholds from a specified file.
Expand All @@ -106,10 +265,11 @@ def read_thresholds_set_from_file(
the thresholds set from the file.
Args:
feature_thresholds (str):
A string specifying the feature thresholds.
feature_thresholds_file (str):
The path to the file containing feature thresholds.
feature_thresholds (Optional str, default None):
A string specifying the feature thresholds.
If we have None, return all thresholds.
Returns:
dict: A dictionary containing the processed feature thresholds.
Expand All @@ -118,9 +278,14 @@ def read_thresholds_set_from_file(
LookupError: If the file does not contain the specified feature_thresholds key.
"""

# open the yaml file
with open(feature_thresholds_file, "r") as file:
thresholds = yaml.safe_load(file)

# if no feature thresholds name is specified, return all thresholds
if feature_thresholds is None:
return thresholds["thresholds"]

if feature_thresholds not in thresholds["thresholds"]:
raise LookupError(
(
Expand Down
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,11 @@ def fixture_cytotable_CFReT_df():
return pd.read_parquet(
"tests/data/cytotable/CFRet_data/test_localhost231120090001_converted.parquet"
)


@pytest.fixture(name="basic_outlier_dataframe")
def fixture_basic_outlier_dataframe():
    """
    Provide a minimal single-column dataframe for outlier tests.

    The "example_feature" column holds the integers 1 through 10.
    """
    example_values = list(range(1, 11))
    return pd.DataFrame({"example_feature": example_values})
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 7447eac

Please sign in to comment.