reformatted cbioportal pipeline for nextflow compatibility (#13)
* reformatted cbioportal pipeline for nextflow compatibility

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor in progress changes in process_cbioportal.nf

* added shebang to process_cbioportal.py

* updated cbioportal scripts to conform to NF ETL pipeline

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* interim commit for transform_cbioportal
resolved conflicts in cbioportal.py

* added transform_cbioportal CLI code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* .gitignore and pyproject.toml changes for transform_cbioportal CLI changes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
jessicaw9910 and pre-commit-ci[bot] authored Apr 5, 2024
1 parent 1f1e326 commit 000fab3
Showing 13 changed files with 415 additions and 120 deletions.
6 changes: 4 additions & 2 deletions .gitignore
@@ -185,5 +185,7 @@ $RECYCLE.BIN/
# End of https://www.toptal.com/developers/gitignore/api/osx,windows,linux

# Requests cache directory
requests_cache/
data_cache/
# requests_cache/
# data_cache/
*params.json
True.sqlite
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -42,6 +42,10 @@ pytest-runner = "^6.0.1"
[tool.pytest.ini_options]
pythonpath = ["src"]

[tool.poetry.scripts]
extract_cbioportal = "missense_kinase_toolkit.cli.extract_cbioportal:main"
transform_cbioportal = "missense_kinase_toolkit.cli.transform_cbioportal:main"

[tool.poetry-dynamic-versioning]
enable = true
vcs = "git"
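
The two `[tool.poetry.scripts]` entries expose the new CLI modules as console commands. As a rough sketch (not the literal wrapper Poetry generates), an entry of the form `name = "package.module:function"` resolves to something like the following once the package is installed:

#!/usr/bin/env python
# Approximate behaviour of the generated `extract_cbioportal` console script:
# import the target module and call the named function; argparse inside main()
# then reads the command-line arguments from sys.argv.
import sys

from missense_kinase_toolkit.cli.extract_cbioportal import main

if __name__ == "__main__":
    sys.exit(main())
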
113 changes: 13 additions & 100 deletions src/missense_kinase_toolkit/cbioportal.py
@@ -1,74 +1,16 @@
#!/usr/bin/env python3

from __future__ import annotations

import os
import pandas as pd
import sys

from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient


CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
DATA_CACHE_DIR = "DATA_CACHE"
CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"


def maybe_get_cbioportal_token_from_env(
) -> str | None:
"""Get the cBioPortal token from the environment
Returns
-------
str | None
cBioPortal token as string if exists, otherwise None
"""
try:
token = os.environ[CBIOPORTAL_TOKEN_VAR]
except KeyError:
token = None

return token


def maybe_get_cbioportal_instance_from_env(
) -> str | None:
"""Get the cBioPortal instance from the environment
Returns
-------
str | None
cBioPortal instance as string if exists, otherwise None
"""
try:
instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
except KeyError:
instance = None

return instance


def maybe_get_cbioportal_cohort_from_env(
) -> str | None:
"""Get the cBioPortal instance from the environment
Returns
-------
str | None
cBioPortal instance as string if exists, otherwise None
"""
try:
instance = os.environ[CBIOPORTAL_COHORT_VAR]
except KeyError:
print("Cohort not found in environment variables. This is necessary to run analysis. Exiting...")
sys.exit(1)

return instance
from missense_kinase_toolkit import config, io_utils


def get_all_mutations_by_study(
study_id: str,
) -> list | None:
"""Get mutations cBioPortal data
@@ -77,19 +19,11 @@ def get_all_mutations_by_study(
list | None
cBioPortal data of Abstract Base Classes objects if successful, otherwise None
"""
token = maybe_get_cbioportal_token_from_env()

instance = maybe_get_cbioportal_instance_from_env()
if instance is not None:
url = f"https://{instance}/api/v2/api-docs"
else:
url = "https://cbioportal.org/api/v2/api-docs"

# Zehir, 2017 MSKCC sequencing cohort is "msk_impact_2017"
# MSKCC clinical sequencing cohort is "mskimpact"
study_id = maybe_get_cbioportal_cohort_from_env()
instance = config.get_cbioportal_instance()
url = f"https://{instance}/api/v2/api-docs"
token = config.maybe_get_cbioportal_token()

if all(v is not None for v in (token, instance)):
if token is not None:
http_client = RequestsClient()
http_client.set_api_key(
instance,
@@ -160,42 +94,21 @@ def parse_iterabc2dataframe(
dict_dir[attr].append(None)

df = pd.DataFrame.from_dict(dict_dir)
df = df[sorted(df.columns.to_list())]

return df


def save_cbioportal_data_to_csv(
df: pd.DataFrame,
def get_and_save_cbioportal_cohort(
study_id: str,
) -> None:
"""Save cBioPortal data to a CSV file
Parameters
----------
df : pd.DataFrame
Dataframe of cBioPortal data
muts = get_all_mutations_by_study(study_id)

Returns
-------
None
"""
try:
path_data = os.environ[DATA_CACHE_DIR]
if not os.path.exists(path_data):
os.makedirs(path_data)
study_id = maybe_get_cbioportal_cohort_from_env()
df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
except KeyError:
print("DATA_CACHE not found in environment variables...")


def main():
muts = get_all_mutations_by_study()
df_muts = parse_iterabc2dataframe(muts)
df_genes = parse_iterabc2dataframe(df_muts["gene"])
df_combo = pd.concat([df_muts, df_genes], axis=1)
df_combo = df_combo.drop(['gene'], axis=1)
save_cbioportal_data_to_csv(df_combo)
df_combo = df_combo.drop(["gene"], axis=1)

filename = f"{study_id}_mutations.csv"

if __name__ == "__main__":
main()
io_utils.save_dataframe_to_csv(df_combo, filename)
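
For context on the refactor above: `parse_iterabc2dataframe` collects the attributes of each bravado ABC object into a dict of lists and builds a DataFrame, and `get_and_save_cbioportal_cohort` runs it a second time over the nested `gene` objects before concatenating and dropping the original column. A self-contained sketch of that pattern, using plain dummy classes in place of the bravado objects (the helper and class names here are illustrative, not part of the package):

import pandas as pd

class Gene:
    """Stand-in for the nested bravado `gene` object."""
    def __init__(self, hugoGeneSymbol, entrezGeneId):
        self.hugoGeneSymbol = hugoGeneSymbol
        self.entrezGeneId = entrezGeneId

class Mutation:
    """Stand-in for a bravado mutation object."""
    def __init__(self, proteinChange, mutationType, gene):
        self.proteinChange = proteinChange
        self.mutationType = mutationType
        self.gene = gene

def parse_objects_to_dataframe(iter_objs):
    # Collect every public attribute of every object into a dict of lists,
    # roughly mirroring what parse_iterabc2dataframe does, then sort columns.
    attrs = [a for a in dir(next(iter(iter_objs))) if not a.startswith("_")]
    dict_out = {a: [getattr(obj, a, None) for obj in iter_objs] for a in attrs}
    df = pd.DataFrame.from_dict(dict_out)
    return df[sorted(df.columns.to_list())]

muts = [Mutation("V600E", "Missense_Mutation", Gene("BRAF", 673))]
df_muts = parse_objects_to_dataframe(muts)
df_genes = parse_objects_to_dataframe(df_muts["gene"])  # expand the nested gene objects
df_combo = pd.concat([df_muts, df_genes], axis=1).drop(["gene"], axis=1)
print(df_combo)  # one row: proteinChange, mutationType, entrezGeneId, hugoGeneSymbol
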
Empty file.
76 changes: 76 additions & 0 deletions src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -0,0 +1,76 @@
#!/usr/bin/env python

import argparse

from missense_kinase_toolkit import config, cbioportal

def parsearg_utils():
parser = argparse.ArgumentParser(
description="Get mutations from cBioPortal cohort and instance"
)

parser.add_argument(
"--cohort",
type=str,
help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
default="msk_impact_2017",
)

parser.add_argument(
"--outDir",
type=str,
help="Required: Output directory path (str)",
)

parser.add_argument(
"--instance",
type=str,
help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `www.cbioportal.org` (str)",
default="www.cbioportal.org",
)

parser.add_argument(
"--token",
type=str,
default="",
help="Optional: cBioPortal API token (str)",
)

# parser.add_argument(
# "--requestsCache",
# type=str,
# default="",
# help="Optional: Requests cache (str)",
# )

# TODO: add logging functionality
return parser


def main():
args = parsearg_utils().parse_args()

str_studies = args.cohort
list_studies = str_studies.split(",")
list_studies = [study.strip() for study in list_studies]

# required argument
config.set_output_dir(args.outDir)

# optional arguments
config.set_cbioportal_instance(args.instance)

try:
if args.token != "":
config.set_cbioportal_token(args.token)
except AttributeError:
pass

# try:
# if args.requestsCache != "":
# config.set_request_cache(args.requestsCache)
# except AttributeError:
# pass

for study in list_studies:
cbioportal.get_and_save_cbioportal_cohort(study)
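
A hedged usage sketch for the new entry point: invoking `main()` programmatically with the same arguments the installed `extract_cbioportal` command would receive. The cohort IDs and output path below are placeholders, and running this performs real cBioPortal API calls and writes CSV files:

import sys

from missense_kinase_toolkit.cli import extract_cbioportal

# Equivalent to: extract_cbioportal --cohort "msk_impact_2017,mskimpact" --outDir ./data
sys.argv = [
    "extract_cbioportal",
    "--cohort", "msk_impact_2017,mskimpact",  # comma-separated study IDs
    "--outDir", "./data",                     # required output directory
]
extract_cbioportal.main()  # writes one <study_id>_mutations.csv per cohort
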
70 changes: 70 additions & 0 deletions src/missense_kinase_toolkit/cli/transform_cbioportal.py
@@ -0,0 +1,70 @@
#!/usr/bin/env python

import argparse

from missense_kinase_toolkit import config, scrapers, io_utils


def parsearg_utils():
parser = argparse.ArgumentParser(
description="Concatenate, remove duplicates, and extract genes and mutation types of interest"
)

parser.add_argument(
"--mutations",
type=str,
help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
default="",
)

parser.add_argument(
"--outDir",
type=str,
help="Required: Output directory path (str)",
)

parser.add_argument(
"--requestsCache",
type=str,
default="",
help="Optional: Requests cache (str)",
)

# TODO: add logging functionality
return parser


def main():
args = parsearg_utils().parse_args()

str_mutations = args.mutations
list_mutations = str_mutations.split(",")
list_mutations = [mutation.strip() for mutation in list_mutations]

# required argument
config.set_output_dir(args.outDir)

try:
if args.requestsCache != "":
config.set_request_cache(args.requestsCache)
except AttributeError:
pass

df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")

df_kinhub = scrapers.kinhub()
io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")

list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()

df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations), ].reset_index(drop=True)
df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc), ].reset_index(drop=True)

list_cols = ["HGNC Name", "UniprotID"]
df_subset_merge = df_subset.merge(df_kinhub[list_cols],
how = "left",
left_on = "hugoGeneSymbol",
right_on = "HGNC Name")
df_subset_merge = df_subset_merge.drop(["HGNC Name"], axis=1)

io_utils.save_dataframe_to_csv(df_subset_merge, "transformed_mutations.csv")
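
The transform step above boils down to a pandas filter-and-merge: keep rows whose mutationType and hugoGeneSymbol match the requested lists, then left-join the KinHub UniProt IDs. A toy illustration with in-memory frames (the column names follow the script; the data are made up):

import pandas as pd

df_cbioportal = pd.DataFrame({
    "hugoGeneSymbol": ["BRAF", "TP53", "EGFR"],
    "mutationType": ["Missense_Mutation", "Nonsense_Mutation", "Missense_Mutation"],
    "proteinChange": ["V600E", "R213*", "L858R"],
})
df_kinhub = pd.DataFrame({
    "HGNC Name": ["BRAF", "EGFR"],
    "UniprotID": ["P15056", "P00533"],
})

list_mutations = ["Missense_Mutation"]
list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()

# Subset to the requested mutation types and to kinase genes, as in main().
df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations)].reset_index(drop=True)
df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc)].reset_index(drop=True)

# Left-merge the UniProt accession, then drop the duplicate gene-name column.
df_out = df_subset.merge(
    df_kinhub[["HGNC Name", "UniprotID"]],
    how="left",
    left_on="hugoGeneSymbol",
    right_on="HGNC Name",
).drop(["HGNC Name"], axis=1)

print(df_out)  # BRAF V600E -> P15056, EGFR L858R -> P00533
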
(diffs for the remaining changed files are not shown)
