diff --git a/poetry.lock b/poetry.lock index 59a6c67..bd4dc3d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -275,6 +275,66 @@ files = [ {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, ] +[[package]] +name = "pyyaml" +version = "6.0.1" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = 
"sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, + {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, 
+ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, +] + [[package]] name = "scipy" version = "1.13.0" @@ -353,4 +413,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "b46f295b19af518175d4fa2ef637d2a9e1a073d5c35c2a4779b6e14a4f122297" +content-hash = "69964da2d2e246bd8dcc2ca64a859456f0cc675041ed322c734b3910990aa0b0" diff --git a/pyproject.toml b/pyproject.toml index 8f59353..6a73b0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ python = ">=3.9,<3.13" pandas = "^2.2.2" scipy = "^1.13.0" pyarrow = "^16.0.0" +pyyaml = "^6.0.1" [tool.poetry.group.dev.dependencies] pytest = "^8.2.0" diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index 759027b..0c41cdd 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -3,15 +3,24 @@ """ import operator +import pathlib from functools import reduce -from typing import Dict, List +from typing import Dict, List, Optional, Union import pandas as pd +import yaml from scipy.stats import zscore as scipy_zscore +DEFAULT_QC_THRESHOLD_FILE = ( + f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml" +) + def find_outliers( - df: pd.DataFrame, feature_thresholds: Dict[str, float], metadata_columns: List[str] + df: pd.DataFrame, + metadata_columns: List[str], + feature_thresholds: Union[Dict[str, float], str], + feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, ) -> pd.DataFrame: """ This function uses z-scoring to format the data for detecting outlier @@ -23,18 +32,31 @@ def find_outliers( Args: df: pd.DataFrame Data frame with converted output from CytoTable. + metadata_columns: List[str] + List of metadata columns that should be outputted with the outlier data. 
feature_thresholds: Dict[str, float] - Dictionary with the feature name(s) as the key(s) and their assigned + One of two options: + A dictionary with the feature name(s) as the key(s) and their assigned threshold for identifying outliers. Positive int for the threshold will detect outliers "above" than the mean, negative int will detect outliers "below" the mean. - metadata_columns: List[str] - List of metadata columns that should be outputted with the outlier data. + Or a string naming a threshold set defined under the "thresholds" key + of the feature_thresholds_file YAML file. + feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE + Path to a YAML file that defines named threshold sets; it is only + read when feature_thresholds is given as a string. Returns: pd.DataFrame: Outlier data frame for the given conditions. """ + + if isinstance(feature_thresholds, str): + feature_thresholds = read_thresholds_set_from_file( + feature_thresholds=feature_thresholds, + feature_thresholds_file=feature_thresholds_file, + ) + # Create z-score columns for each feature to reference during outlier detection zscore_columns = {} for feature in feature_thresholds: @@ -71,3 +93,40 @@ def find_outliers( # Return outliers DataFrame with specified columns return outliers_df[columns_to_include] + + +def read_thresholds_set_from_file( + feature_thresholds: str, feature_thresholds_file: str +): + """ + Reads a set of feature thresholds from a specified file. + + This function takes the path to a feature thresholds file and the + name of a threshold set, reads the file as YAML, and returns + the threshold set stored under that name. + + Args: + feature_thresholds (str): + The name of the threshold set to look up in the file. + feature_thresholds_file (str): + The path to the YAML file containing feature thresholds. + + Returns: + dict: The feature thresholds stored under the given name. + + Raises: + LookupError: If the file does not contain the specified feature_thresholds key. 
+ """ + + with open(feature_thresholds_file, "r") as file: + thresholds = yaml.safe_load(file) + + if feature_thresholds not in thresholds["thresholds"]: + raise LookupError( + ( + f"Unable to find threshold set by name {feature_thresholds}" + f" within {feature_thresholds_file}" + ) + ) + + return thresholds["thresholds"][feature_thresholds] diff --git a/src/cosmicqc/data/qc_nuclei_thresholds_default.yml b/src/cosmicqc/data/qc_nuclei_thresholds_default.yml new file mode 100644 index 0000000..b312018 --- /dev/null +++ b/src/cosmicqc/data/qc_nuclei_thresholds_default.yml @@ -0,0 +1,16 @@ +# defines threshold sets for running qc procedures as part of this project. +versions: + cellprofiler: ">=4.2.4" +thresholds: + # Set a negative threshold to identify both outlier small nuclei + # and low formfactor representing non-circular segmentations. + small_and_low_formfactor_nuclei: + Nuclei_AreaShape_Area: -1 + Nuclei_AreaShape_FormFactor: -1 + # find very elongated nuclei segmentations (above mean) + elongated_nuclei: + Nuclei_AreaShape_Eccentricity: 2 + # find large nuclei segmentations (above mean) and low formfactor + large_nuclei: + Nuclei_AreaShape_Area: 2 + Nuclei_AreaShape_FormFactor: -2 diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 93103e1..49f949b 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -3,6 +3,7 @@ """ import pandas as pd +import pytest from cosmicqc import analyze @@ -183,3 +184,99 @@ def test_find_outliers_cfret(cytotable_CFReT_data_df: pd.DataFrame): 14811: "f01", }, } + + +def test_read_thresholds_set_from_file(): + """ + Tests read_thresholds_set_from_file + """ + + # test that an exception is raised on receiving a bad + # lookup value from the thresholds file. 
+ with pytest.raises(LookupError): + analyze.read_thresholds_set_from_file( + feature_thresholds="bad_lookup_value", + feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE, + ) + + # test default threshold sets + assert analyze.read_thresholds_set_from_file( + feature_thresholds="small_and_low_formfactor_nuclei", + feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE, + ) == {"Nuclei_AreaShape_Area": -1, "Nuclei_AreaShape_FormFactor": -1} + + assert analyze.read_thresholds_set_from_file( + feature_thresholds="elongated_nuclei", + feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE, + ) == {"Nuclei_AreaShape_Eccentricity": 2} + + assert analyze.read_thresholds_set_from_file( + feature_thresholds="large_nuclei", + feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE, + ) == {"Nuclei_AreaShape_Area": 2, "Nuclei_AreaShape_FormFactor": -2} + + +def test_find_outliers_dict_and_default_config_cfret( + cytotable_CFReT_data_df: pd.DataFrame, +): + """ + Testing find_outliers with dictionary vs yaml threshold sets + using CytoTable CFReT data. 
+ """ + + # metadata columns to include in output data frame + metadata_columns = [ + "Image_Metadata_Plate", + "Image_Metadata_Well", + "Image_Metadata_Site", + ] + + # test that the output is the same from dict vs yaml + pd.testing.assert_frame_equal( + analyze.find_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds={ + "Nuclei_AreaShape_Area": -1, + "Nuclei_AreaShape_FormFactor": -1, + }, + metadata_columns=metadata_columns, + ), + analyze.find_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds="small_and_low_formfactor_nuclei", + metadata_columns=metadata_columns, + ), + ) + + # test that the output is the same from dict vs yaml + pd.testing.assert_frame_equal( + analyze.find_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds={ + "Nuclei_AreaShape_Eccentricity": 2, + }, + metadata_columns=metadata_columns, + ), + analyze.find_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds="elongated_nuclei", + metadata_columns=metadata_columns, + ), + ) + + # test that the output is the same from dict vs yaml + pd.testing.assert_frame_equal( + analyze.find_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds={ + "Nuclei_AreaShape_Area": 2, + "Nuclei_AreaShape_FormFactor": -2, + }, + metadata_columns=metadata_columns, + ), + analyze.find_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds="large_nuclei", + metadata_columns=metadata_columns, + ), + )