reformatted cbioportal pipeline for nextflow compatibility (#13)
* reformatted cbioportal pipeline for nextflow compatibility

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor in progress changes in process_cbioportal.nf

* added shebang to process_cbioportal.py

* updated cbioportal scripts to conform to NF ETL pipeline

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* interim commit for transform_cbioportal
resolved conflicts in cbioportal.py

* added transform_cbioportal CLI code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* .gitignore and pyproject.toml changes for transform_cbioportal CLI changes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
jessicaw9910 and pre-commit-ci[bot] authored Apr 5, 2024
1 parent 1f1e326 commit 000fab3
Showing 13 changed files with 415 additions and 120 deletions.
6 changes: 4 additions & 2 deletions .gitignore
@@ -185,5 +185,7 @@ $RECYCLE.BIN/
# End of https://www.toptal.com/developers/gitignore/api/osx,windows,linux

# Requests cache directory
requests_cache/
data_cache/
# requests_cache/
# data_cache/
*params.json
True.sqlite
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -42,6 +42,10 @@ pytest-runner = "^6.0.1"
[tool.pytest.ini_options]
pythonpath = ["src"]

[tool.poetry.scripts]
extract_cbioportal = "missense_kinase_toolkit.cli.extract_cbioportal:main"
transform_cbioportal = "missense_kinase_toolkit.cli.transform_cbioportal:main"

[tool.poetry-dynamic-versioning]
enable = true
vcs = "git"
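
The two `[tool.poetry.scripts]` entries expose the new CLI modules as console commands. As a rough sketch (not the literal wrapper Poetry generates), an entry of the form `name = "package.module:function"` resolves to something like the following once the package is installed:

#!/usr/bin/env python
# Approximate behaviour of the generated `extract_cbioportal` console script:
# import the target module and call the named function; argparse inside main()
# then reads the command-line arguments from sys.argv.
import sys

from missense_kinase_toolkit.cli.extract_cbioportal import main

if __name__ == "__main__":
    sys.exit(main())
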
113 changes: 13 additions & 100 deletions src/missense_kinase_toolkit/cbioportal.py
@@ -1,74 +1,16 @@
#!/usr/bin/env python3

from __future__ import annotations

import os
import pandas as pd
import sys

from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient


CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
DATA_CACHE_DIR = "DATA_CACHE"
CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"


def maybe_get_cbioportal_token_from_env(
) -> str | None:
"""Get the cBioPortal token from the environment
Returns
-------
str | None
cBioPortal token as string if exists, otherwise None
"""
try:
token = os.environ[CBIOPORTAL_TOKEN_VAR]
except KeyError:
token = None

return token


def maybe_get_cbioportal_instance_from_env(
) -> str | None:
"""Get the cBioPortal instance from the environment
Returns
-------
str | None
cBioPortal instance as string if exists, otherwise None
"""
try:
instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
except KeyError:
instance = None

return instance


def maybe_get_cbioportal_cohort_from_env(
) -> str | None:
"""Get the cBioPortal instance from the environment
Returns
-------
str | None
cBioPortal instance as string if exists, otherwise None
"""
try:
instance = os.environ[CBIOPORTAL_COHORT_VAR]
except KeyError:
print("Cohort not found in environment variables. This is necessary to run analysis. Exiting...")
sys.exit(1)

return instance
from missense_kinase_toolkit import config, io_utils


def get_all_mutations_by_study(
study_id: str,
) -> list | None:
"""Get mutations cBioPortal data
@@ -77,19 +19,11 @@ def get_all_mutations_by_study(
list | None
cBioPortal data of Abstract Base Classes objects if successful, otherwise None
"""
token = maybe_get_cbioportal_token_from_env()

instance = maybe_get_cbioportal_instance_from_env()
if instance is not None:
url = f"https://{instance}/api/v2/api-docs"
else:
url = "https://cbioportal.org/api/v2/api-docs"

# Zehir, 2017 MSKCC sequencing cohort is "msk_impact_2017"
# MSKCC clinical sequencing cohort is "mskimpact"
study_id = maybe_get_cbioportal_cohort_from_env()
instance = config.get_cbioportal_instance()
url = f"https://{instance}/api/v2/api-docs"
token = config.maybe_get_cbioportal_token()

if all(v is not None for v in (token, instance)):
if token is not None:
http_client = RequestsClient()
http_client.set_api_key(
instance,
@@ -160,42 +94,21 @@ def parse_iterabc2dataframe(
dict_dir[attr].append(None)

df = pd.DataFrame.from_dict(dict_dir)
df = df[sorted(df.columns.to_list())]

return df


def save_cbioportal_data_to_csv(
df: pd.DataFrame,
def get_and_save_cbioportal_cohort(
study_id: str,
) -> None:
"""Save cBioPortal data to a CSV file
Parameters
----------
df : pd.DataFrame
Dataframe of cBioPortal data
muts = get_all_mutations_by_study(study_id)

Returns
-------
None
"""
try:
path_data = os.environ[DATA_CACHE_DIR]
if not os.path.exists(path_data):
os.makedirs(path_data)
study_id = maybe_get_cbioportal_cohort_from_env()
df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
except KeyError:
print("DATA_CACHE not found in environment variables...")


def main():
muts = get_all_mutations_by_study()
df_muts = parse_iterabc2dataframe(muts)
df_genes = parse_iterabc2dataframe(df_muts["gene"])
df_combo = pd.concat([df_muts, df_genes], axis=1)
df_combo = df_combo.drop(['gene'], axis=1)
save_cbioportal_data_to_csv(df_combo)
df_combo = df_combo.drop(["gene"], axis=1)

filename = f"{study_id}_mutations.csv"

if __name__ == "__main__":
main()
io_utils.save_dataframe_to_csv(df_combo, filename)
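
For context on the refactor above: `parse_iterabc2dataframe` collects the attributes of each bravado ABC object into a dict of lists and builds a DataFrame, and `get_and_save_cbioportal_cohort` runs it a second time over the nested `gene` objects before concatenating and dropping the original column. A self-contained sketch of that pattern, using plain dummy classes in place of the bravado objects (the helper and class names here are illustrative, not part of the package):

import pandas as pd

class Gene:
    """Stand-in for the nested bravado `gene` object."""
    def __init__(self, hugoGeneSymbol, entrezGeneId):
        self.hugoGeneSymbol = hugoGeneSymbol
        self.entrezGeneId = entrezGeneId

class Mutation:
    """Stand-in for a bravado mutation object."""
    def __init__(self, proteinChange, mutationType, gene):
        self.proteinChange = proteinChange
        self.mutationType = mutationType
        self.gene = gene

def parse_objects_to_dataframe(iter_objs):
    # Collect every public attribute of every object into a dict of lists,
    # roughly mirroring what parse_iterabc2dataframe does, then sort columns.
    attrs = [a for a in dir(next(iter(iter_objs))) if not a.startswith("_")]
    dict_out = {a: [getattr(obj, a, None) for obj in iter_objs] for a in attrs}
    df = pd.DataFrame.from_dict(dict_out)
    return df[sorted(df.columns.to_list())]

muts = [Mutation("V600E", "Missense_Mutation", Gene("BRAF", 673))]
df_muts = parse_objects_to_dataframe(muts)
df_genes = parse_objects_to_dataframe(df_muts["gene"])  # expand the nested gene objects
df_combo = pd.concat([df_muts, df_genes], axis=1).drop(["gene"], axis=1)
print(df_combo)  # one row: proteinChange, mutationType, entrezGeneId, hugoGeneSymbol
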
Empty file.
76 changes: 76 additions & 0 deletions src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -0,0 +1,76 @@
#!/usr/bin/env python

import argparse

from missense_kinase_toolkit import config, cbioportal

def parsearg_utils():
parser = argparse.ArgumentParser(
description="Get mutations from cBioPortal cohort and instance"
)

parser.add_argument(
"--cohort",
type=str,
help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
default="msk_impact_2017",
)

parser.add_argument(
"--outDir",
type=str,
help="Required: Output directory path (str)",
)

parser.add_argument(
"--instance",
type=str,
help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `www.cbioportal.org` (str)",
default="www.cbioportal.org",
)

parser.add_argument(
"--token",
type=str,
default="",
help="Optional: cBioPortal API token (str)",
)

# parser.add_argument(
# "--requestsCache",
# type=str,
# default="",
# help="Optional: Requests cache (str)",
# )

# TODO: add logging functionality
return parser


def main():
args = parsearg_utils().parse_args()

str_studies = args.cohort
list_studies = str_studies.split(",")
list_studies = [study.strip() for study in list_studies]

# required argument
config.set_output_dir(args.outDir)

# optional arguments
config.set_cbioportal_instance(args.instance)

try:
if args.token != "":
config.set_cbioportal_token(args.token)
except AttributeError:
pass

# try:
# if args.requestsCache != "":
# config.set_request_cache(args.requestsCache)
# except AttributeError:
# pass

for study in list_studies:
cbioportal.get_and_save_cbioportal_cohort(study)
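
A hedged usage sketch for the new entry point: invoking `main()` programmatically with the same arguments the installed `extract_cbioportal` command would receive. The cohort IDs and output path below are placeholders, and running this performs real cBioPortal API calls and writes CSV files:

import sys

from missense_kinase_toolkit.cli import extract_cbioportal

# Equivalent to: extract_cbioportal --cohort "msk_impact_2017,mskimpact" --outDir ./data
sys.argv = [
    "extract_cbioportal",
    "--cohort", "msk_impact_2017,mskimpact",  # comma-separated study IDs
    "--outDir", "./data",                     # required output directory
]
extract_cbioportal.main()  # writes one <study_id>_mutations.csv per cohort
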
70 changes: 70 additions & 0 deletions src/missense_kinase_toolkit/cli/transform_cbioportal.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python

import argparse

from missense_kinase_toolkit import config, scrapers, io_utils


def parsearg_utils():
parser = argparse.ArgumentParser(
description="Concatenate, remove duplicates, and extract genes and mutation types of interest"
)

parser.add_argument(
"--mutations",
type=str,
help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
default="",
)

parser.add_argument(
"--outDir",
type=str,
help="Required: Output directory path (str)",
)

parser.add_argument(
"--requestsCache",
type=str,
default="",
help="Optional: Requests cache (str)",
)

# TODO: add logging functionality
return parser


def main():
args = parsearg_utils().parse_args()

str_mutations = args.mutations
list_mutations = str_mutations.split(",")
list_mutations = [mutation.strip() for mutation in list_mutations]

# required argument
config.set_output_dir(args.outDir)

try:
if args.requestsCache != "":
config.set_request_cache(args.requestsCache)
except AttributeError:
pass

df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")

df_kinhub = scrapers.kinhub()
io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")

list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()

df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations), ].reset_index(drop=True)
df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc), ].reset_index(drop=True)

list_cols = ["HGNC Name", "UniprotID"]
df_subset_merge = df_subset.merge(df_kinhub[list_cols],
how = "left",
left_on = "hugoGeneSymbol",
right_on = "HGNC Name")
df_subset_merge = df_subset_merge.drop(["HGNC Name"], axis=1)

io_utils.save_dataframe_to_csv(df_subset_merge, "transformed_mutations.csv")
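
The transform step above boils down to a pandas filter-and-merge: keep rows whose mutationType and hugoGeneSymbol match the requested lists, then left-join the KinHub UniProt IDs. A toy illustration with in-memory frames (the column names follow the script; the data are made up):

import pandas as pd

df_cbioportal = pd.DataFrame({
    "hugoGeneSymbol": ["BRAF", "TP53", "EGFR"],
    "mutationType": ["Missense_Mutation", "Nonsense_Mutation", "Missense_Mutation"],
    "proteinChange": ["V600E", "R213*", "L858R"],
})
df_kinhub = pd.DataFrame({
    "HGNC Name": ["BRAF", "EGFR"],
    "UniprotID": ["P15056", "P00533"],
})

list_mutations = ["Missense_Mutation"]
list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()

# Subset to the requested mutation types and to kinase genes, as in main().
df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations)].reset_index(drop=True)
df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc)].reset_index(drop=True)

# Left-merge the UniProt accession, then drop the duplicate gene-name column.
df_out = df_subset.merge(
    df_kinhub[["HGNC Name", "UniprotID"]],
    how="left",
    left_on="hugoGeneSymbol",
    right_on="HGNC Name",
).drop(["HGNC Name"], axis=1)

print(df_out)  # BRAF V600E -> P15056, EGFR L858R -> P00533
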
(diffs for the remaining changed files are not shown)
