Nextflow (#14)
* reformatted cbioportal pipeline for nextflow compatibility

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor in progress changes in process_cbioportal.nf

* added shebang to process_cbioportal.py

* updated cbioportal scripts to conform to NF ETL pipeline

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* interim commit for transform_cbioportal
resolved conflicts in cbioportal.py

* added transform_cbioportal CLI code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* .gitignore and pyproject.toml changes for transform_cbioportal CLI changes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated NF README

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated cli and pfam scripts

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
jessicaw9910 and pre-commit-ci[bot] authored Apr 5, 2024
1 parent 000fab3 commit a300939
Showing 6 changed files with 85 additions and 172 deletions.
14 changes: 1 addition & 13 deletions src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -36,14 +36,8 @@ def parsearg_utils():
         help="Optional: cBioPortal API token (str)",
     )
 
-    # parser.add_argument(
-    #     "--requestsCache",
-    #     type=str,
-    #     default="",
-    #     help="Optional: Requests cache (str)",
-    # )
-
     # TODO: add logging functionality
+    # TODO: cache requests for cBioPortal API
     return parser

@@ -66,11 +60,5 @@ def main():
     except AttributeError:
         pass
 
-    # try:
-    #     if args.requestsCache != "":
-    #         config.set_request_cache(args.requestsCache)
-    # except AttributeError:
-    #     pass
-
     for study in list_studies:
         cbioportal.get_and_save_cbioportal_cohort(study)
15 changes: 6 additions & 9 deletions src/missense_kinase_toolkit/cli/transform_cbioportal.py
100644 → 100755 (file mode change: made executable)
@@ -14,7 +14,7 @@ def parsearg_utils():
         "--mutations",
         type=str,
         help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
-        default="",
+        default="Missense_Mutation",
     )
 
     parser.add_argument(

@@ -25,9 +25,9 @@
 
     parser.add_argument(
         "--requestsCache",
-        type=str,
-        default="",
-        help="Optional: Requests cache (str)",
+        type=bool,
+        default=False,
+        help="Optional: Requests cache; default False (bool)",
     )
 
     # TODO: add logging functionality

@@ -44,11 +44,8 @@ def main():
     # required argument
     config.set_output_dir(args.outDir)
 
-    try:
-        if args.requestsCache != "":
-            config.set_request_cache(args.requestsCache)
-    except AttributeError:
-        pass
+    # optional argument
+    config.set_request_cache(args.requestsCache)
 
     df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
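A side note on the new `--requestsCache` flag (an observation about argparse, not part of the commit): `type=bool` applies `bool()` to the raw command-line string, and any non-empty string is truthy, so an explicit `--requestsCache False` still parses as `True`. A minimal sketch of the pitfall:

```python
# Illustrative only; mirrors the flag added in the diff above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--requestsCache", type=bool, default=False)

print(parser.parse_args([]).requestsCache)                            # False (default)
print(parser.parse_args(["--requestsCache", "False"]).requestsCache)  # True, because bool("False") is True
```

Common alternatives are `action="store_true"` or a custom string-to-bool converter.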
5 changes: 3 additions & 2 deletions src/missense_kinase_toolkit/config.py
@@ -107,7 +107,7 @@ def maybe_get_cbioportal_token(
 
 
 def set_request_cache(
-    val: str
+    val: bool
 ) -> None:
     """Set the request cache path in environment variables

@@ -120,7 +120,8 @@
     -------
     None
     """
-    os.environ[REQUEST_CACHE_VAR] = val
+    #TODO: val should be bool but doesn't work with env, fix
+    os.environ[REQUEST_CACHE_VAR] = str(val)
 
 
 def maybe_get_request_cache(
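The `#TODO` above reflects that `os.environ` only stores strings, which is why the diff stringifies the boolean. A minimal sketch of one way the round trip could work, assuming `REQUEST_CACHE_VAR` names the environment variable (the repository's actual getter may differ):

```python
# Illustrative round trip of a bool through the environment.
import os

REQUEST_CACHE_VAR = "REQUESTS_CACHE"  # assumed value of the config constant

def set_request_cache(val: bool) -> None:
    os.environ[REQUEST_CACHE_VAR] = str(val)  # stores "True" or "False"

def get_request_cache() -> bool:
    # env values are strings, so compare against the stringified form
    return os.environ.get(REQUEST_CACHE_VAR, "False") == "True"

set_request_cache(True)
assert get_request_cache() is True
```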
4 changes: 3 additions & 1 deletion src/missense_kinase_toolkit/io_utils.py
@@ -48,6 +48,7 @@ def save_dataframe_to_csv(
 
 def concatenate_csv_files_with_glob(
     str_find: str,
+    str_remove: str = "transformed_mutations.csv",
 ) -> pd.DataFrame:
     """Use glob to find csv files to concatenate

@@ -66,11 +67,12 @@
     str_find = str_find.replace(".csv", "") + ".csv"
     path_data = check_outdir_exists()
     csv_files = glob.glob(os.path.join(path_data, str_find))
+    csv_files = [csv_file for csv_file in csv_files if str_remove not in csv_file]
 
     df_combo = pd.DataFrame()
     if len(csv_files) > 0:
         for csv_file in csv_files:
-            df = pd.read_csv(csv_file)
+            df = pd.read_csv(csv_file, low_memory=False)
             df_combo = pd.concat([df_combo, df])
     else:
         print(f"No files matching {str_find} found in {path_data}...")
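The new `str_remove` parameter keeps the function from re-ingesting its own output when the concatenated, transformed file is written to the same directory as the per-study extracts. Hypothetical usage, mirroring the call in `transform_cbioportal.py`:

```python
# Assumes the configured output dir holds files like study1_mutations.csv;
# any filename containing "transformed_mutations.csv" (the default
# str_remove) is excluded before concatenation.
from missense_kinase_toolkit import io_utils

df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
print(df_cbioportal.shape)
```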
205 changes: 65 additions & 140 deletions src/missense_kinase_toolkit/pfam.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-from missense_kinase_toolkit import requests_wrapper
+from missense_kinase_toolkit import requests_wrapper, utils_requests
 
 
 def retrieve_pfam(

@@ -21,7 +21,6 @@ def retrieve_pfam(
         DataFrame with Pfam domain information if request is successful, UniProt ID if request fails;
         None if response is empty
     """
-
     url = f"https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/UniProt/{uniprot_id}"
 
     header = {"Accept": "application/json"}

@@ -31,146 +30,72 @@
     )
 
     if res.ok:
-        dict_json = json.loads(res.text)["results"]
-        try:
-            df1_out = pd.DataFrame()
-            df2_out = pd.DataFrame()
-
-            for entry in dict_json:
-                df1_temp = pd.DataFrame.from_dict(
-                    entry["metadata"], orient="index"
-                ).transpose()
-                df1_out = pd.concat([df1_out, df1_temp]).reset_index(drop=True)
-
-                df2_temp = pd.DataFrame.from_dict(
-                    entry["proteins"][0], orient="index"
-                ).transpose()
-                df2_out = pd.concat([df2_out, df2_temp]).reset_index(drop=True)
-
-            df1_out = df1_out.rename(columns={"accession": "pfam_accession"})
-            df2_out = df2_out.rename(
-                columns={
-                    "accession": "uniprot_accession",
-                    "source_database": "review_status",
-                }
-            )
-
-            df_out = pd.concat([df1_out, df2_out], axis=1)
-            df_out = df_out.explode("entry_protein_locations").reset_index(drop=True)
-
-            list_entry = ["model", "score"]
-            for entry in list_entry:
-                df_out[entry] = df_out["entry_protein_locations"].apply(
-                    lambda x: x[entry]
-                )
-
-            list_fragments = ["start", "end", "dc-status"]
-            for entry in list_fragments:
-                df_out[entry] = df_out["entry_protein_locations"].apply(
-                    lambda x: x["fragments"][0][entry]
-                )
-
-            del df_out["entry_protein_locations"]
-
-            return df_out
-        except KeyError:
-            print("Error:")
-            print(dict_json)
-            print()
-    if len(res.text) == 0:
-        print(f"No PFAM domains found: {uniprot_id}")
-        return None
-    else:
-        return uniprot_id
-
-
-def concat_pfam(
-    iter_uniprot: iter[str],
-    iter_hgnc: iter[str],
-) -> tuple[pd.DataFrame, dict[str, str], dict[str, str]]:
-    """Concatenate Pfam domain information for a list of UniProt IDs
-    Parameters
-    ----------
-    iter_uniprot : iter[str]
-        Iterable of UniProt IDs
-    iter_hgnc : iter[str]
-        Iterable of HGNC symbols
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with Pfam domain information
-    dict[str, str]
-        Dictionary of HGNC symbols and UniProt IDs with errors
-    dict[str, str]
-        Dictionary of HGNC symbols and UniProt IDs with missing information
-    """
-    dict_error = {}
-    dict_missing = {}
-    df = pd.DataFrame()
-
-    for uniprot, hgnc in zip(iter_uniprot, iter_hgnc):
-        temp = retrieve_pfam(uniprot)
-
-        if temp is None:
-            dict_error[hgnc] = uniprot
-        if type(temp) is str:
-            dict_missing[hgnc] = uniprot
-        else:
-            temp.insert(0, "hgnc", hgnc)
-            df = pd.concat([df, temp]).reset_index(drop=True)
-
-    return df, dict_error, dict_missing
-
-
-def extract_numeric(
-    input_string: str,
-) -> str:
-    """Extract numeric characters from a string
-    Parameters
-    ----------
-    input_string : str
-        Input string
-    Returns
-    -------
-    str
-        Numeric characters extracted from the input string
-    """
-    num = ""
-    for i in input_string:
-        if i.isdigit():
-            num = num + i
-    return num
-
+        list_json = json.loads(res.text)["results"]
+
+        # metadata for UniProt ID
+        list_metadata = [entry["metadata"] for entry in list_json]
+        list_metadata = [{"pfam_accession" if k == "accession" else k:v for k,v in entry.items()} for entry in list_metadata]
+
+        # Pfam domains locations
+        list_locations = [entry["proteins"][0]["entry_protein_locations"][0]["fragments"][0] for entry in list_json]
+
+        # model information
+        list_model = [entry["proteins"][0]["entry_protein_locations"][0] for entry in list_json]
+        [entry.pop("fragments", None) for entry in list_model]
+
+        # protein information
+        # do last because pop is an in-place operation
+        list_protein = [entry["proteins"][0] for entry in list_json]
+        [entry.pop("entry_protein_locations", None) for entry in list_protein]
+        list_protein = [{"uniprot" if k == "accession" else k:v for k,v in entry.items()} for entry in list_protein]
+
+        df_concat = pd.concat(
+            [
+                pd.DataFrame(list_protein),
+                pd.DataFrame(list_metadata),
+                pd.DataFrame(list_locations),
+                pd.DataFrame(list_model)
+            ],
+            axis=1
+        )
+
-def find_pfam(
-    input_hgnc: str,
-    input_position: int,
-    df_ref: pd.DataFrame,
-) -> str | None:
-    """Find Pfam domain for a given HGNC symbol and position
+        return df_concat
+    else:
+        utils_requests.print_status_code_if_res_not_ok(res)
+        return None
-
-    Parameters
-    ----------
-    input_hgnc : str
-        HGNC symbol
-    input_position : int
-        Codon position
-    df_ref : pd.DataFrame
-        DataFrame with Pfam domain information
-
-    Returns
-    -------
-    str | None
-        Pfam domain if found, None if not found
-    """
-    df_temp = df_ref.loc[df_ref["hgnc"] == input_hgnc].reset_index()
-    try:
-        domain = df_temp.loc[
-            ((input_position >= df_temp["start"]) & (input_position <= df_temp["end"])),
-            "name",
-        ].values[0]
-        return domain
-    except IndexError:
-        return None
+# def find_pfam(
+#     input_hgnc: str,
+#     input_position: int,
+#     df_ref: pd.DataFrame,
+# ) -> str | None:
+#     """Find Pfam domain for a given HGNC symbol and position
+
+#     Parameters
+#     ----------
+#     input_hgnc : str
+#         HGNC symbol
+#     input_position : int
+#         Codon position
+#     df_ref : pd.DataFrame
+#         DataFrame with Pfam domain information
+
+#     Returns
+#     -------
+#     str | None
+#         Pfam domain if found, None if not found
+#     """
+#     df_temp = df_ref.loc[df_ref["hgnc"] == input_hgnc].reset_index()
+#     try:
+#         domain = df_temp.loc[
+#             ((input_position >= df_temp["start"]) & (input_position <= df_temp["end"])),
+#             "name",
+#         ].values[0]
+#         return domain
+#     except IndexError:
+#         return None
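To see what the rewritten parsing in `retrieve_pfam` produces, here is a toy run of the same flattening steps on a single fabricated entry shaped like the InterPro response (all values invented for illustration):

```python
import pandas as pd

# one mock entry from the InterPro "results" list
list_json = [
    {
        "metadata": {"accession": "PF07714", "name": "PK_Tyr_Ser-Thr"},
        "proteins": [
            {
                "accession": "p00533",
                "source_database": "reviewed",
                "entry_protein_locations": [
                    {
                        "fragments": [{"start": 712, "end": 968, "dc-status": "CONTINUOUS"}],
                        "model": "PF07714",
                        "score": 1.1e-45,
                    }
                ],
            }
        ],
    }
]

# same steps as the new code: rename the metadata accession, pull fragment
# coordinates, strip fragments from the model dict, then strip locations
# from the protein dict (pop is in-place, so order matters)
list_metadata = [{"pfam_accession" if k == "accession" else k: v for k, v in e["metadata"].items()} for e in list_json]
list_locations = [e["proteins"][0]["entry_protein_locations"][0]["fragments"][0] for e in list_json]
list_model = [e["proteins"][0]["entry_protein_locations"][0] for e in list_json]
[e.pop("fragments", None) for e in list_model]
list_protein = [e["proteins"][0] for e in list_json]
[e.pop("entry_protein_locations", None) for e in list_protein]
list_protein = [{"uniprot" if k == "accession" else k: v for k, v in e.items()} for e in list_protein]

df_concat = pd.concat(
    [pd.DataFrame(list_protein), pd.DataFrame(list_metadata), pd.DataFrame(list_locations), pd.DataFrame(list_model)],
    axis=1,
)
print(df_concat.columns.tolist())
# ['uniprot', 'source_database', 'pfam_accession', 'name', 'start', 'end', 'dc-status', 'model', 'score']
```

Note that the new code keeps only the first location and first fragment per entry; domains split across multiple locations would need the exploded handling the old version used.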
14 changes: 7 additions & 7 deletions src/nextflow/README.MD
@@ -15,10 +15,10 @@ Generate own `params.json` file using the following parameters:
 
 Below is a description of what each variable should contain. If variable is optional and not in use, do not create any entry in the `json` file.
 
-| Variable | Optional | Description |
-| :--------------------| :------: | :---------- |
-| 'CBIOPORTAL_COHORT' | No | cBioPortal cohort to analyze |
-| 'OUTPUT_DIR' | No | Path to outdir to save data |
-| 'CBIOPORTAL_INSTANCE'| Yes | `cbioportal.org` if none provided |
-| 'CBIOPORTAL_TOKEN' | Yes | Data Access Token if using private instance|
-| 'REQUESTS_CACHE' | Yes | Path to dir to cache requests data |
+| Variable | Optional | Description |
+| :--------------------| :------: | :----------------------------------------- |
+| CBIOPORTAL_COHORT | No | cBioPortal cohort to analyze |
+| OUTPUT_DIR | No | Path to outdir to save data |
+| CBIOPORTAL_INSTANCE | Yes | `cbioportal.org` if none provided |
+| CBIOPORTAL_TOKEN | Yes | Data Access Token if using private instance|
+| REQUESTS_CACHE | Yes | Boolean of whether to cache requests data |

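Putting the table together, a hypothetical `params.json` might look like the following (cohort name and path are placeholders; `CBIOPORTAL_TOKEN` is omitted since this example targets the public instance):

```json
{
    "CBIOPORTAL_COHORT": "msk_impact_2017",
    "OUTPUT_DIR": "/path/to/output",
    "CBIOPORTAL_INSTANCE": "cbioportal.org",
    "REQUESTS_CACHE": true
}
```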