From a300939b4a39fec91a848412d62f8e1d602a009d Mon Sep 17 00:00:00 2001 From: Jess White <50890758+jessicaw9910@users.noreply.github.com> Date: Fri, 5 Apr 2024 15:50:21 -0400 Subject: [PATCH] Nextflow (#14) * reformatted cbioportal pipeline for nextflow compatibility * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor in progress changes in process_cbioportal.nf * added shebang to process_cbioportal.py * updated cbioportal scripts to conform to NF ETL pipeline * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * interim commit for transform_cbioportal resolved conflicts in cbioportal.py * added transform_cbioportal CLI code * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * .gitignore and pyproject.toml changes for transform_cbioportal CLI changes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated NF README * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updated cli and pfam scripts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../cli/extract_cbioportal.py | 14 +- .../cli/transform_cbioportal.py | 15 +- src/missense_kinase_toolkit/config.py | 5 +- src/missense_kinase_toolkit/io_utils.py | 4 +- src/missense_kinase_toolkit/pfam.py | 205 ++++++------------ src/nextflow/README.MD | 14 +- 6 files changed, 85 insertions(+), 172 deletions(-) mode change 100644 => 100755 src/missense_kinase_toolkit/cli/transform_cbioportal.py diff --git a/src/missense_kinase_toolkit/cli/extract_cbioportal.py b/src/missense_kinase_toolkit/cli/extract_cbioportal.py index 31dc887..c5df5ea 100755 --- a/src/missense_kinase_toolkit/cli/extract_cbioportal.py +++ b/src/missense_kinase_toolkit/cli/extract_cbioportal.py @@ -36,14 +36,8 @@ def parsearg_utils(): help="Optional: cBioPortal API token (str)", ) - # parser.add_argument( - # "--requestsCache", - # type=str, - # default="", - # help="Optional: Requests cache (str)", - # ) - # TODO: add logging functionality + # TODO: cache requests for cBioPortal API return parser @@ -66,11 +60,5 @@ def main(): except AttributeError: pass - # try: - # if args.requestsCache != "": - # config.set_request_cache(args.requestsCache) - # except AttributeError: - # pass - for study in list_studies: cbioportal.get_and_save_cbioportal_cohort(study) diff --git a/src/missense_kinase_toolkit/cli/transform_cbioportal.py b/src/missense_kinase_toolkit/cli/transform_cbioportal.py old mode 100644 new mode 100755 index b2e815c..d8e51be --- a/src/missense_kinase_toolkit/cli/transform_cbioportal.py +++ b/src/missense_kinase_toolkit/cli/transform_cbioportal.py @@ -14,7 +14,7 @@ def parsearg_utils(): "--mutations", type=str, help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)", - default="", + default="Missense_Mutation", ) parser.add_argument( @@ -25,9 +25,9 @@ def parsearg_utils(): parser.add_argument( "--requestsCache", - type=str, - default="", - help="Optional: Requests cache (str)", + type=bool, + default=False, + help="Optional: Requests cache; default False (bool)", ) # TODO: add logging functionality @@ -44,11 +44,8 @@ def main(): # required argument config.set_output_dir(args.outDir) - try: - if args.requestsCache != "": - config.set_request_cache(args.requestsCache) - except AttributeError: - pass + # optional argument + config.set_request_cache(args.requestsCache) df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv") diff --git a/src/missense_kinase_toolkit/config.py b/src/missense_kinase_toolkit/config.py index 8efa52f..7f05065 100644 --- a/src/missense_kinase_toolkit/config.py +++ b/src/missense_kinase_toolkit/config.py @@ -107,7 +107,7 @@ def maybe_get_cbioportal_token( def set_request_cache( - val: str + val: bool ) -> None: """Set the request cache path in environment variables @@ -120,7 +120,8 @@ def set_request_cache( ------- None """ - os.environ[REQUEST_CACHE_VAR] = val + #TODO: val should be bool but doesn't work with env, fix + os.environ[REQUEST_CACHE_VAR] = str(val) def maybe_get_request_cache( diff --git a/src/missense_kinase_toolkit/io_utils.py b/src/missense_kinase_toolkit/io_utils.py index e6f3188..c005461 100644 --- a/src/missense_kinase_toolkit/io_utils.py +++ b/src/missense_kinase_toolkit/io_utils.py @@ -48,6 +48,7 @@ def save_dataframe_to_csv( def concatenate_csv_files_with_glob( str_find: str, + str_remove: str = "transformed_mutations.csv", ) -> pd.DataFrame: """Use glob to find csv files to concatenate @@ -66,11 +67,12 @@ def concatenate_csv_files_with_glob( str_find = str_find.replace(".csv", "") + ".csv" path_data = check_outdir_exists() csv_files = glob.glob(os.path.join(path_data, str_find)) + csv_files = [csv_file for csv_file in csv_files if str_remove not in csv_file] df_combo = pd.DataFrame() if len(csv_files) > 0: for csv_file in csv_files: - df = pd.read_csv(csv_file) + df = pd.read_csv(csv_file, low_memory=False) df_combo = pd.concat([df_combo, df]) else: print(f"No files matching {str_find} found in {path_data}...") diff --git a/src/missense_kinase_toolkit/pfam.py b/src/missense_kinase_toolkit/pfam.py index d398326..d99dbe1 100644 --- a/src/missense_kinase_toolkit/pfam.py +++ b/src/missense_kinase_toolkit/pfam.py @@ -2,7 +2,7 @@ import pandas as pd -from missense_kinase_toolkit import requests_wrapper +from missense_kinase_toolkit import requests_wrapper, utils_requests def retrieve_pfam( @@ -21,7 +21,6 @@ def retrieve_pfam( DataFrame with Pfam domain information if request is successful, UniProt ID if request fails; None if response is empty """ - url = f"https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/UniProt/{uniprot_id}" header = {"Accept": "application/json"} @@ -31,146 +30,72 @@ def retrieve_pfam( ) if res.ok: - dict_json = json.loads(res.text)["results"] - try: - df1_out = pd.DataFrame() - df2_out = pd.DataFrame() - - for entry in dict_json: - df1_temp = pd.DataFrame.from_dict( - entry["metadata"], orient="index" - ).transpose() - df1_out = pd.concat([df1_out, df1_temp]).reset_index(drop=True) - - df2_temp = pd.DataFrame.from_dict( - entry["proteins"][0], orient="index" - ).transpose() - df2_out = pd.concat([df2_out, df2_temp]).reset_index(drop=True) - - df1_out = df1_out.rename(columns={"accession": "pfam_accession"}) - df2_out = df2_out.rename( - columns={ - "accession": "uniprot_accession", - "source_database": "review_status", - } - ) - - df_out = pd.concat([df1_out, df2_out], axis=1) - df_out = df_out.explode("entry_protein_locations").reset_index(drop=True) - - list_entry = ["model", "score"] - for entry in list_entry: - df_out[entry] = df_out["entry_protein_locations"].apply( - lambda x: x[entry] - ) - - list_fragments = ["start", "end", "dc-status"] - for entry in list_fragments: - df_out[entry] = df_out["entry_protein_locations"].apply( - lambda x: x["fragments"][0][entry] - ) - - del df_out["entry_protein_locations"] - - return df_out - except KeyError: - print("Error:") - print(dict_json) - print() + if len(res.text) == 0: + print(f"No PFAM domains found: {uniprot_id}") return None - else: - return uniprot_id - - -def concat_pfam( - iter_uniprot: iter[str], - iter_hgnc: iter[str], -) -> tuple[pd.DataFrame, dict[str, str], dict[str, str]]: - """Concatenate Pfam domain information for a list of UniProt IDs - - Parameters - ---------- - iter_uniprot : iter[str] - Iterable of UniProt IDs - iter_hgnc : iter[str] - Iterable of HGNC symbols - - Returns - ------- - pd.DataFrame - DataFrame with Pfam domain information - dict[str, str] - Dictionary of HGNC symbols and UniProt IDs with errors - dict[str, str] - Dictionary of HGNC symbols and UniProt IDs with missing information - """ - dict_error = {} - dict_missing = {} - df = pd.DataFrame() - - for uniprot, hgnc in zip(iter_uniprot, iter_hgnc): - temp = retrieve_pfam(uniprot) - - if temp is None: - dict_error[hgnc] = uniprot - if type(temp) is str: - dict_missing[hgnc] = uniprot else: - temp.insert(0, "hgnc", hgnc) - df = pd.concat([df, temp]).reset_index(drop=True) - - return df, dict_error, dict_missing - - -def extract_numeric( - input_string: str, -) -> str: - """Extract numeric characters from a string - - Parameters - ---------- - input_string : str - Input string - - Returns - ------- - str - Numeric characters extracted from the input string - """ - num = "" - for i in input_string: - if i.isdigit(): - num = num + i - return num - + list_json = json.loads(res.text)["results"] + + # metadata for UniProt ID + list_metadata = [entry["metadata"] for entry in list_json] + list_metadata = [{"pfam_accession" if k == "accession" else k:v for k,v in entry.items()} for entry in list_metadata] + + # Pfam domains locations + list_locations = [entry["proteins"][0]["entry_protein_locations"][0]["fragments"][0] for entry in list_json] + + # model information + list_model = [entry["proteins"][0]["entry_protein_locations"][0] for entry in list_json] + [entry.pop("fragments", None) for entry in list_model] + + # protein information + # do last because pop is an in-place operation + list_protein = [entry["proteins"][0] for entry in list_json] + [entry.pop("entry_protein_locations", None) for entry in list_protein] + list_protein = [{"uniprot" if k == "accession" else k:v for k,v in entry.items()} for entry in list_protein] + + df_concat = pd.concat( + [ + pd.DataFrame(list_protein), + pd.DataFrame(list_metadata), + pd.DataFrame(list_locations), + pd.DataFrame(list_model) + ], + axis=1 + ) -def find_pfam( - input_hgnc: str, - input_position: int, - df_ref: pd.DataFrame, -) -> str | None: - """Find Pfam domain for a given HGNC symbol and position + return df_concat + else: + utils_requests.print_status_code_if_res_not_ok(res) + return None - Parameters - ---------- - input_hgnc : str - HGNC symbol - input_position : int - Codon position - df_ref : pd.DataFrame - DataFrame with Pfam domain information - Returns - ------- - str | None - Pfam domain if found, None if not found - """ - df_temp = df_ref.loc[df_ref["hgnc"] == input_hgnc].reset_index() - try: - domain = df_temp.loc[ - ((input_position >= df_temp["start"]) & (input_position <= df_temp["end"])), - "name", - ].values[0] - return domain - except IndexError: - return None +# def find_pfam( +# input_hgnc: str, +# input_position: int, +# df_ref: pd.DataFrame, +# ) -> str | None: +# """Find Pfam domain for a given HGNC symbol and position + +# Parameters +# ---------- +# input_hgnc : str +# HGNC symbol +# input_position : int +# Codon position +# df_ref : pd.DataFrame +# DataFrame with Pfam domain information + +# Returns +# ------- +# str | None +# Pfam domain if found, None if not found +# """ +# df_temp = df_ref.loc[df_ref["hgnc"] == input_hgnc].reset_index() +# try: +# domain = df_temp.loc[ +# ((input_position >= df_temp["start"]) & (input_position <= df_temp["end"])), +# "name", +# ].values[0] +# return domain +# except IndexError: +# return None diff --git a/src/nextflow/README.MD b/src/nextflow/README.MD index c27e1b1..c95c83d 100644 --- a/src/nextflow/README.MD +++ b/src/nextflow/README.MD @@ -15,10 +15,10 @@ Generate own `params.json` file using the following parameters: Below is a description of what each variable should contain. If variable is optional and not in use, do not create any entry in the `json` file. -| Variable | Optional | Description | -| :--------------------| :------: | :---------- | -| 'CBIOPORTAL_COHORT' | No | cBioPortal cohort to analyze | -| 'OUTPUT_DIR' | No | Path to outdir to save data | -| 'CBIOPORTAL_INSTANCE'| Yes | `cbioportal.org` if none provided | -| 'CBIOPORTAL_TOKEN' | Yes | Data Access Token if using private instance| -| 'REQUESTS_CACHE' | Yes | Path to dir to cache requests data | +| Variable | Optional | Description | +| :--------------------| :------: | :----------------------------------------- | +| CBIOPORTAL_COHORT | No | cBioPortal cohort to analyze | +| OUTPUT_DIR | No | Path to outdir to save data | +| CBIOPORTAL_INSTANCE | Yes | `cbioportal.org` if none provided | +| CBIOPORTAL_TOKEN | Yes | Data Access Token if using private instance| +| REQUESTS_CACHE | Yes | Boolean of whether to cache requests data |