diff --git a/.gitignore b/.gitignore
index bc4bf39..d667037 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,5 +185,7 @@ $RECYCLE.BIN/
 # End of https://www.toptal.com/developers/gitignore/api/osx,windows,linux
 
 # Requests cache directory
-requests_cache/
-data_cache/
+# requests_cache/
+# data_cache/
+*params.json
+True.sqlite
diff --git a/pyproject.toml b/pyproject.toml
index 925a25b..a607aed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,10 @@ pytest-runner = "^6.0.1"
 [tool.pytest.ini_options]
 pythonpath = ["src"]
 
+[tool.poetry.scripts]
+extract_cbioportal = "missense_kinase_toolkit.cli.extract_cbioportal:main"
+transform_cbioportal = "missense_kinase_toolkit.cli.transform_cbioportal:main"
+
 [tool.poetry-dynamic-versioning]
 enable = true
 vcs = "git"
diff --git a/src/missense_kinase_toolkit/cbioportal.py b/src/missense_kinase_toolkit/cbioportal.py
index 6ff52c5..67403ef 100644
--- a/src/missense_kinase_toolkit/cbioportal.py
+++ b/src/missense_kinase_toolkit/cbioportal.py
@@ -1,74 +1,16 @@
 #!/usr/bin/env python3
 
-from __future__ import annotations
-
 import os
 import pandas as pd
-import sys
 
 from bravado.client import SwaggerClient
 from bravado.requests_client import RequestsClient
-
-CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
-CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
-DATA_CACHE_DIR = "DATA_CACHE"
-CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"
-
-
-def maybe_get_cbioportal_token_from_env(
-) -> str | None:
-    """Get the cBioPortal token from the environment
-
-    Returns
-    -------
-    str | None
-        cBioPortal token as string if exists, otherwise None
-    """
-    try:
-        token = os.environ[CBIOPORTAL_TOKEN_VAR]
-    except KeyError:
-        token = None
-
-    return token
-
-
-def maybe_get_cbioportal_instance_from_env(
-) -> str | None:
-    """Get the cBioPortal instance from the environment
-
-    Returns
-    -------
-    str | None
-        cBioPortal instance as string if exists, otherwise None
-    """
-    try:
-        instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
-    except KeyError:
-        instance = None
-
-    return instance
-
-
-def maybe_get_cbioportal_cohort_from_env(
-) -> str | None:
-    """Get the cBioPortal instance from the environment
-
-    Returns
-    -------
-    str | None
-        cBioPortal instance as string if exists, otherwise None
-    """
-    try:
-        instance = os.environ[CBIOPORTAL_COHORT_VAR]
-    except KeyError:
-        print("Cohort not found in environment variables. This is necessary to run analysis. Exiting...")
-        sys.exit(1)
-
-    return instance
+from missense_kinase_toolkit import config, io_utils
 
 
 def get_all_mutations_by_study(
+    study_id: str,
 ) -> list | None:
     """Get mutations cBioPortal data
 
@@ -77,19 +19,11 @@
     list | None
         cBioPortal data of Abstract Base Classes objects if successful, otherwise None
     """
-    token = maybe_get_cbioportal_token_from_env()
-
-    instance = maybe_get_cbioportal_instance_from_env()
-    if instance is not None:
-        url = f"https://{instance}/api/v2/api-docs"
-    else:
-        url = "https://cbioportal.org/api/v2/api-docs"
-
-    # Zehir, 2017 MSKCC sequencing cohort is "msk_impact_2017"
-    # MSKCC clinical sequencing cohort is "mskimpact"
-    study_id = maybe_get_cbioportal_cohort_from_env()
+    instance = config.get_cbioportal_instance()
+    url = f"https://{instance}/api/v2/api-docs"
+    token = config.maybe_get_cbioportal_token()
 
-    if all(v is not None for v in (token, instance)):
+    if token is not None:
         http_client = RequestsClient()
         http_client.set_api_key(
             instance,
@@ -160,42 +94,21 @@
             dict_dir[attr].append(None)
 
     df = pd.DataFrame.from_dict(dict_dir)
+    df = df[sorted(df.columns.to_list())]
 
     return df
 
 
-def save_cbioportal_data_to_csv(
-    df: pd.DataFrame,
+def get_and_save_cbioportal_cohort(
+    study_id: str,
 ) -> None:
-    """Save cBioPortal data to a CSV file
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Dataframe of cBioPortal data
+    """Extract all mutations for a cBioPortal study and save them as a CSV file
+
+    Parameters
+    ----------
+    study_id : str
+        cBioPortal study identifier (e.g., "msk_impact_2017")
+    """
+    muts = get_all_mutations_by_study(study_id)
 
-    Returns
-    -------
-    None
-    """
-    try:
-        path_data = os.environ[DATA_CACHE_DIR]
-        if not os.path.exists(path_data):
-            os.makedirs(path_data)
-        study_id = maybe_get_cbioportal_cohort_from_env()
-        df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
-    except KeyError:
-        print("DATA_CACHE not found in environment variables...")
-
-
-def main():
-    muts = get_all_mutations_by_study()
     df_muts = parse_iterabc2dataframe(muts)
     df_genes = parse_iterabc2dataframe(df_muts["gene"])
 
     df_combo = pd.concat([df_muts, df_genes], axis=1)
-    df_combo = df_combo.drop(['gene'], axis=1)
+    df_combo = df_combo.drop(["gene"], axis=1)
 
-    save_cbioportal_data_to_csv(df_combo)
+    filename = f"{study_id}_mutations.csv"
 
-if __name__ == "__main__":
-    main()
+    io_utils.save_dataframe_to_csv(df_combo, filename)
diff --git a/src/missense_kinase_toolkit/cli/__init__.py b/src/missense_kinase_toolkit/cli/__init__.py
new file mode 100644
index 0000000..e69de29
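As an illustrative sketch only (not part of the patch; the output directory and cohort ID are placeholder values), the refactored extraction path can also be driven directly from Python through the new `config` setters and `cbioportal.get_and_save_cbioportal_cohort` introduced in this diff:

```python
from missense_kinase_toolkit import cbioportal, config

# placeholder values: any writable directory and any valid cBioPortal study ID
config.set_output_dir("/tmp/mkt_output")
config.set_cbioportal_instance("www.cbioportal.org")
# config.set_cbioportal_token("...")  # only needed when querying a private instance

# fetches all mutations for the study and writes msk_impact_2017_mutations.csv to OUTPUT_DIR
cbioportal.get_and_save_cbioportal_cohort("msk_impact_2017")
```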
diff --git a/src/missense_kinase_toolkit/cli/extract_cbioportal.py b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
new file mode 100755
index 0000000..31dc887
--- /dev/null
+++ b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+import argparse
+
+from missense_kinase_toolkit import config, cbioportal
+
+
+def parsearg_utils():
+    parser = argparse.ArgumentParser(
+        description="Get mutations from cBioPortal cohort and instance"
+    )
+
+    parser.add_argument(
+        "--cohort",
+        type=str,
+        help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
+        default="msk_impact_2017",
+    )
+
+    parser.add_argument(
+        "--outDir",
+        type=str,
+        help="Required: Output directory path (str)",
+    )
+
+    parser.add_argument(
+        "--instance",
+        type=str,
+        help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `www.cbioportal.org` (str)",
+        default="www.cbioportal.org",
+    )
+
+    parser.add_argument(
+        "--token",
+        type=str,
+        default="",
+        help="Optional: cBioPortal API token (str)",
+    )
+
+    # parser.add_argument(
+    #     "--requestsCache",
+    #     type=str,
+    #     default="",
+    #     help="Optional: Requests cache (str)",
+    # )
+
+    # TODO: add logging functionality
+
+    return parser
+
+
+def main():
+    args = parsearg_utils().parse_args()
+
+    str_studies = args.cohort
+    list_studies = str_studies.split(",")
+    list_studies = [study.strip() for study in list_studies]
+
+    # required argument
+    config.set_output_dir(args.outDir)
+
+    # optional arguments
+    config.set_cbioportal_instance(args.instance)
+
+    try:
+        if args.token != "":
+            config.set_cbioportal_token(args.token)
+    except AttributeError:
+        pass
+
+    # try:
+    #     if args.requestsCache != "":
+    #         config.set_request_cache(args.requestsCache)
+    # except AttributeError:
+    #     pass
+
+    for study in list_studies:
+        cbioportal.get_and_save_cbioportal_cohort(study)
diff --git a/src/missense_kinase_toolkit/cli/transform_cbioportal.py b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
new file mode 100644
index 0000000..b2e815c
--- /dev/null
+++ b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import argparse
+
+from missense_kinase_toolkit import config, scrapers, io_utils
+
+
+def parsearg_utils():
+    parser = argparse.ArgumentParser(
+        description="Concatenate, remove duplicates, and extract genes and mutation types of interest"
+    )
+
+    parser.add_argument(
+        "--mutations",
+        type=str,
+        help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
+        default="",
+    )
+
+    parser.add_argument(
+        "--outDir",
+        type=str,
+        help="Required: Output directory path (str)",
+    )
+
+    parser.add_argument(
+        "--requestsCache",
+        type=str,
+        default="",
+        help="Optional: Requests cache (str)",
+    )
+
+    # TODO: add logging functionality
+
+    return parser
+
+
+def main():
+    args = parsearg_utils().parse_args()
+
+    str_mutations = args.mutations
+    list_mutations = str_mutations.split(",")
+    list_mutations = [mutation.strip() for mutation in list_mutations]
+
+    # required argument
+    config.set_output_dir(args.outDir)
+
+    try:
+        if args.requestsCache != "":
+            config.set_request_cache(args.requestsCache)
+    except AttributeError:
+        pass
+
+    df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
+
+    df_kinhub = scrapers.kinhub()
+    io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")
+
+    list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()
+
+    df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations), ].reset_index(drop=True)
+    df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc), ].reset_index(drop=True)
+
+    list_cols = ["HGNC Name", "UniprotID"]
+    df_subset_merge = df_subset.merge(df_kinhub[list_cols],
+                                      how="left",
+                                      left_on="hugoGeneSymbol",
+                                      right_on="HGNC Name")
+    df_subset_merge = df_subset_merge.drop(["HGNC Name"], axis=1)
+
+    io_utils.save_dataframe_to_csv(df_subset_merge, "transformed_mutations.csv")
diff --git a/src/missense_kinase_toolkit/config.py b/src/missense_kinase_toolkit/config.py
new file mode 100644
index 0000000..8efa52f
--- /dev/null
+++ b/src/missense_kinase_toolkit/config.py
@@ -0,0 +1,138 @@
+import os
+import sys
+
+
+OUTPUT_DIR_VAR = "OUTPUT_DIR"
+CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
+CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
+REQUEST_CACHE_VAR = "REQUESTS_CACHE"
+
+
+def set_output_dir(
+    val: str
+) -> None:
+    """Set the output directory in environment variables
+
+    Parameters
+    ----------
+    val : str
+        Output directory path
+
+    Returns
+    -------
+    None
+    """
+    os.environ[OUTPUT_DIR_VAR] = val
+
+
+def get_output_dir(
+) -> str | None:
+    """Get the output directory from the environment
+
+    Returns
+    -------
+    str | None
+        Output directory path if exists, otherwise None
+    """
+    try:
+        return os.environ[OUTPUT_DIR_VAR]
+    except KeyError:
+        print("Output directory not found in environment variables. This is necessary to run analysis. Exiting...")
+        sys.exit(1)
+
+
+def set_cbioportal_instance(
+    val: str
+) -> None:
+    """Set the cBioPortal instance in the environment variables
+
+    Parameters
+    ----------
+    val : str
+        cBioPortal instance; e.g., "cbioportal.mskcc.org" for the private MSKCC portal or "www.cbioportal.org" for the public portal
+
+    Returns
+    -------
+    None
+    """
+    os.environ[CBIOPORTAL_INSTANCE_VAR] = val
+
+
+def get_cbioportal_instance(
+) -> str | None:
+    """Get the cBioPortal instance from the environment
+
+    Returns
+    -------
+    str | None
+        cBioPortal instance as string if exists, otherwise None
+    """
+    try:
+        return os.environ[CBIOPORTAL_INSTANCE_VAR]
+    except KeyError:
+        print("cBioPortal instance not found in environment variables. This is necessary to run analysis. Exiting...")
+        sys.exit(1)
+
+
+def set_cbioportal_token(
+    val: str
+) -> None:
+    """Set the cBioPortal token in the environment variables
+
+    Parameters
+    ----------
+    val : str
+        cBioPortal token
+
+    Returns
+    -------
+    None
+    """
+    os.environ[CBIOPORTAL_TOKEN_VAR] = val
+
+
+def maybe_get_cbioportal_token(
+) -> str | None:
+    """Get the cBioPortal token from the environment
+
+    Returns
+    -------
+    str | None
+        cBioPortal token as string if exists, otherwise None
+    """
+    try:
+        return os.environ[CBIOPORTAL_TOKEN_VAR]
+    except KeyError:
+        return None
+
+
+def set_request_cache(
+    val: str
+) -> None:
+    """Set the request cache path in environment variables
+
+    Parameters
+    ----------
+    val : str
+        Request cache path
+
+    Returns
+    -------
+    None
+    """
+    os.environ[REQUEST_CACHE_VAR] = val
+
+
+def maybe_get_request_cache(
+) -> str | None:
+    """Get the request cache path from the environment
+
+    Returns
+    -------
+    str | None
+        Request cache path as string if exists, otherwise None
+    """
+    try:
+        return os.environ[REQUEST_CACHE_VAR]
+    except KeyError:
+        return None
diff --git a/src/missense_kinase_toolkit/hgnc.py b/src/missense_kinase_toolkit/hgnc.py
index c5e616a..bc65a0e 100644
--- a/src/missense_kinase_toolkit/hgnc.py
+++ b/src/missense_kinase_toolkit/hgnc.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import requests
 
 from missense_kinase_toolkit import requests_wrapper, utils_requests
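For reference, a minimal sketch (assuming a fresh environment where none of these variables are pre-set) of how the `config` getters and setters above are intended to behave:

```python
import os

from missense_kinase_toolkit import config

# setters simply write to os.environ; getters read the value back
config.set_cbioportal_instance("www.cbioportal.org")
assert config.get_cbioportal_instance() == "www.cbioportal.org"

# optional values fall back to None when unset...
os.environ.pop("CBIOPORTAL_TOKEN", None)
assert config.maybe_get_cbioportal_token() is None

# ...whereas required values (e.g., OUTPUT_DIR) exit via sys.exit(1) when missing
```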
diff --git a/src/missense_kinase_toolkit/io_utils.py b/src/missense_kinase_toolkit/io_utils.py
index 2bc0285..e6f3188 100644
--- a/src/missense_kinase_toolkit/io_utils.py
+++ b/src/missense_kinase_toolkit/io_utils.py
@@ -2,7 +2,25 @@
 
 import pandas as pd
 
-DATA_CACHE_DIR = "DATA_CACHE"
+OUTPUT_DIR_VAR = "OUTPUT_DIR"
+
+
+def check_outdir_exists(
+) -> str:
+    """Check that OUTPUT_DIR is in the environment variables and create the directory if it does not exist
+
+    Returns
+    -------
+    str
+        Output directory path
+    """
+    try:
+        path_data = os.environ[OUTPUT_DIR_VAR]
+        if not os.path.exists(path_data):
+            os.makedirs(path_data)
+    except KeyError as err:
+        # fail loudly; returning here would leave path_data unbound
+        raise KeyError("OUTPUT_DIR not found in environment variables...") from err
+
+    return path_data
 
 
 def save_dataframe_to_csv(
@@ -15,19 +33,48 @@
     ----------
     df : pd.DataFrame
         Dataframe to save
-    output_path : str
-        Path to save the CSV file
+    filename : str
+        Filename to save (either with or without the ".csv" suffix)
+
     Returns
     -------
     None
     """
     filename = filename.replace(".csv", "") + ".csv"
+    path_data = check_outdir_exists()
+    df.to_csv(os.path.join(path_data, filename), index=False)
 
-    try:
-        path_data = os.environ[DATA_CACHE_DIR]
-        if not os.path.exists(path_data):
-            os.makedirs(path_data)
-        df.to_csv(os.path.join(path_data, f"{filename}_mutations.csv"), index=False)
-    except KeyError:
-        print("DATA_CACHE not found in environment variables...")
+
+def concatenate_csv_files_with_glob(
+    str_find: str,
+) -> pd.DataFrame:
+    """Use glob to find CSV files to concatenate
+
+    Parameters
+    ----------
+    str_find : str
+        Glob pattern used to find the CSV files of interest
+
+    Returns
+    -------
+    pd.DataFrame
+        Concatenated dataframe
+    """
+    import glob
+
+    str_find = str_find.replace(".csv", "") + ".csv"
+    path_data = check_outdir_exists()
+    csv_files = glob.glob(os.path.join(path_data, str_find))
+
+    df_combo = pd.DataFrame()
+    if len(csv_files) > 0:
+        for csv_file in csv_files:
+            df = pd.read_csv(csv_file)
+            df_combo = pd.concat([df_combo, df])
+    else:
+        print(f"No files matching {str_find} found in {path_data}...")
+
+    # TODO: implement duplicate removal
+
+    return df_combo
diff --git a/src/missense_kinase_toolkit/pfam.py b/src/missense_kinase_toolkit/pfam.py
index f623e7f..d398326 100644
--- a/src/missense_kinase_toolkit/pfam.py
+++ b/src/missense_kinase_toolkit/pfam.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import json
 
 import pandas as pd
diff --git a/src/missense_kinase_toolkit/scrapers.py b/src/missense_kinase_toolkit/scrapers.py
index d36b3ae..21ca8c0 100644
--- a/src/missense_kinase_toolkit/scrapers.py
+++ b/src/missense_kinase_toolkit/scrapers.py
@@ -1,7 +1,10 @@
 import pandas as pd
 
-def scrape_kinhub(
-    url: str ='http://www.kinhub.org/kinases.html'
+from missense_kinase_toolkit import requests_wrapper
+
+
+def kinhub(
+    url: str = "http://www.kinhub.org/kinases.html",
 ) -> pd.DataFrame:
     """Scrape the KinHub database for kinase information
 
@@ -24,7 +27,7 @@
     # perhaps just write own function to clean column names
     # from janitor import clean_names
 
-    page = requests.get(url)
+    page = requests_wrapper.get_cached_session().get(url)
     soup = BeautifulSoup(page.content, "html.parser")
 
     list_header = [t for tr in soup.select('tr') for t in tr if t.name == 'th']
@@ -49,4 +52,10 @@
     df_kinhub = pd.DataFrame.from_dict(dict_kinhub)
     # df_kinhub = clean_names(df_kinhub)
 
-    return df_kinhub
+    # for kinases with 2 kinase domains, entries are duplicated despite sharing the same UniProt ID
+    # drop these duplicates
+    df_kinhub_drop = df_kinhub.loc[~df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x)), ]
+    # list_uniprot = df_kinhub["UniprotID"][df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x))].to_list()
+    # assert df_kinhub.shape[0] - df_kinhub_drop.shape[0] == df_kinhub_drop["UniprotID"].isin(list_uniprot).sum()
+
+    return df_kinhub_drop
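As an illustrative sketch only (the output path is a placeholder; the file patterns are the defaults used elsewhere in this diff), the transform step stitches these `io_utils` and `scrapers` helpers together roughly as follows:

```python
from missense_kinase_toolkit import config, io_utils, scrapers

config.set_output_dir("/tmp/mkt_output")  # placeholder path

# gather every per-study extract written by the extract step ("<study_id>_mutations.csv")
df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")

# scrape KinHub (second-kinase-domain duplicates already dropped) and cache the table
df_kinhub = scrapers.kinhub()
io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")

print(df_cbioportal.shape, df_kinhub.shape)
```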
diff --git a/src/nextflow/README.MD b/src/nextflow/README.MD
new file mode 100644
index 0000000..c27e1b1
--- /dev/null
+++ b/src/nextflow/README.MD
@@ -0,0 +1,24 @@
+# NextFlow workflow to run pipeline
+
+To run: `nextflow run main.nf -params-file params.json`
+
+Generate your own `params.json` file using the following parameters:
+```
+{
+    "CBIOPORTAL_COHORT" : "TODO",
+    "OUTPUT_DIR" : "TODO",
+    "CBIOPORTAL_INSTANCE" : "TODO",
+    "CBIOPORTAL_TOKEN" : "TODO",
+    "REQUESTS_CACHE" : "TODO"
+}
+```
+
+Below is a description of what each variable should contain. If a variable is optional and not in use, do not create an entry for it in the `json` file.
+
+| Variable               | Optional | Description                                        |
+| :--------------------- | :------: | :------------------------------------------------- |
+| `CBIOPORTAL_COHORT`    | No       | cBioPortal cohort to analyze                       |
+| `OUTPUT_DIR`           | No       | Path to the output directory in which to save data |
+| `CBIOPORTAL_INSTANCE`  | Yes      | Defaults to `cbioportal.org` if none provided      |
+| `CBIOPORTAL_TOKEN`     | Yes      | Data Access Token if using a private instance      |
+| `REQUESTS_CACHE`       | Yes      | Path to the directory used to cache requests data  |
diff --git a/src/nextflow/extract_cbioportal.nf b/src/nextflow/extract_cbioportal.nf
new file mode 100644
index 0000000..b9462ca
--- /dev/null
+++ b/src/nextflow/extract_cbioportal.nf
@@ -0,0 +1,16 @@
+process PROCESS_CBIOPORTAL {
+    input:
+    tuple val(cbio_cohort), path(out_dir), val(cbio_inst), val(cbio_token), path(request_cache)
+
+    output:
+    path("${out_dir}/cbioportal")
+    """
+    export PYTHONHASHSEED=0
+    extract_cbioportal \
+        --cohort ${cbio_cohort} \
+        --outDir ${out_dir} \
+        --instance ${cbio_inst} \
+        --token ${cbio_token} \
+        --requestsCache ${request_cache}
+    """
+}