Cbioportal #11

Merged · 11 commits · Apr 3, 2024
Changes from all commits
4 changes: 3 additions & 1 deletion .github/workflows/CI.yaml
@@ -35,10 +35,12 @@ jobs:
         ulimit -a

       # More info on options: https://github.com/marketplace/actions/provision-with-micromamba
-      - uses: mamba-org/provision-with-micromamba@main
+      # https://github.com/mamba-org/provision-with-micromamba#migration-to-setup-micromamba
+      - uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: devtools/conda-envs/test_env.yaml
          environment-name: test
+          # conda-forge is the default channel now and does not need to be specified
          channels: conda-forge,defaults
          extra-specs: |
            python=${{ matrix.python-version }}
4 changes: 4 additions & 0 deletions .gitignore
@@ -183,3 +183,7 @@ $RECYCLE.BIN/
 *.lnk

 # End of https://www.toptal.com/developers/gitignore/api/osx,windows,linux
+
+# Requests cache directory
+requests_cache/
+data_cache/
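These two ignored directories line up with the caching code elsewhere in the PR: requests-cache for HTTP responses and the DATA_CACHE CSV dump. A sketch of how requests_cache/ could appear on disk, assuming requests-cache's filesystem backend (the backend choice is an assumption; this diff only ignores the directory):

import requests_cache

# With the filesystem backend, the cache name is a directory path, so cached
# responses land under requests_cache/, which is why the directory is ignored.
session = requests_cache.CachedSession("requests_cache/http", backend="filesystem")
resp = session.get("https://www.cbioportal.org/api/cancer-types")
print(resp.from_cache)  # True on a repeated call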
1,232 changes: 1,180 additions & 52 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pyproject.toml
@@ -1,6 +1,7 @@
 [tool.poetry]
 name = "missense-kinase-toolkit"
 # https://github.com/mtkennerly/poetry-dynamic-versioning/issues/14
+# https://browniebroke.com/blog/convert-existing-poetry-to-src-layout/
 version = "0.0.0"
 description = "An ETL pipeline package to facilitate structure-based ML for human kinase property prediction"
 authors = ["Jess White <[email protected]>"]
@@ -23,6 +24,9 @@ tqdm = "4.64.0"
 pandas = ">=2,<3"
 requests = ">=2.28.1,<3"
 requests-cache = ">=0.9.7,<1"
+bravado = "^11.0.3"
+janitor = "^0.1.1"
+beautifulsoup4 = "^4.12.3"
Empty file added src/__init__.py
Empty file.
266 changes: 161 additions & 105 deletions src/missense_kinase_toolkit/cbioportal.py
@@ -1,145 +1,201 @@
-from __future__ import annotations
+#!/usr/bin/env python3

-import re
+from __future__ import annotations

-import requests
+import os
 import pandas as pd
+import sys

+from bravado.client import SwaggerClient
+from bravado.requests_client import RequestsClient
+
+CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
+CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
+DATA_CACHE_DIR = "DATA_CACHE"
+CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"

-def create_setlist(
-    input_object: requests.models.Response,
-    attr: str,
-) -> tuple[list, set]:
-    """Create a list and set of unique values from a response object
-
-    Parameters
-    ----------
-    input_object : requests.models.Response
-        Response object from a request
-    attr : str
-        Attribute to extract from the response object
+
+def maybe_get_cbioportal_token_from_env() -> str | None:
+    """Get the cBioPortal token from the environment

     Returns
     -------
-    tuple[list, set]
-        List and set of unique values from the response object
+    str | None
+        cBioPortal token as string if it exists, otherwise None
     """
-    list_output = []
-    set_output = set()
-
-    for entry in input_object:
-        list_output.append(entry[attr])
-        set_output.add(entry[attr])
-
-    return list_output, set_output
+    try:
+        token = os.environ[CBIOPORTAL_TOKEN_VAR]
+    except KeyError:
+        token = None
+
+    return token

-def print_counts(
-    list_input: list,
-) -> None:
-    """Print the counts of unique values in a list
-
-    Parameters
-    ----------
-    list_input : list
-        List of values to count
+def maybe_get_cbioportal_instance_from_env() -> str | None:
+    """Get the cBioPortal instance from the environment

     Returns
     -------
-    None
+    str | None
+        cBioPortal instance as string if it exists, otherwise None
     """
-    for Unique in set(list_input):
-        n = list_input.count(Unique)
-        print(f"{Unique:<15} \t {n:>10}")
+    try:
+        instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
+    except KeyError:
+        instance = None
+
+    return instance
+
+
+def maybe_get_cbioportal_cohort_from_env() -> str | None:
+    """Get the cBioPortal cohort from the environment
+
+    Returns
+    -------
+    str | None
+        cBioPortal cohort as string if it exists; exits if not set
+    """
+    try:
+        cohort = os.environ[CBIOPORTAL_COHORT_VAR]
+    except KeyError:
+        print("Cohort not found in environment variables. This is necessary to run analysis. Exiting...")
+        sys.exit(1)
+
+    return cohort

-def parse_obj2dict(
-    input_object: requests.models.Response,
-) -> dict:
-    """Parse a response object into a dictionary
+def get_all_mutations_by_study() -> list | None:
+    """Get mutation data for a study from cBioPortal
+
+    Returns
+    -------
+    list | None
+        List of Abstract Base Class objects if successful, otherwise None
+    """
+    token = maybe_get_cbioportal_token_from_env()
+
+    instance = maybe_get_cbioportal_instance_from_env()
+    if instance is not None:
+        url = f"https://{instance}/api/v2/api-docs"
+    else:
+        url = "https://cbioportal.org/api/v2/api-docs"
+
+    # Zehir, 2017 MSKCC sequencing cohort is "msk_impact_2017"
+    # MSKCC clinical sequencing cohort is "mskimpact"
+    study_id = maybe_get_cbioportal_cohort_from_env()
+
+    if all(v is not None for v in (token, instance)):
+        http_client = RequestsClient()
+        http_client.set_api_key(
+            instance,
+            f"Bearer {token}",
+            param_name="Authorization",
+            param_in="header",
+        )
+        cbioportal = SwaggerClient.from_url(
+            url,
+            http_client=http_client,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False,
+            },
+        )
+    else:
+        cbioportal = SwaggerClient.from_url(
+            url,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False,
+            },
+        )
+
+    studies = cbioportal.Studies.getAllStudiesUsingGET().result()
+    study_ids = [study.studyId for study in studies]
+
+    if study_id in study_ids:
+        # TODO: add error handling
+        # TODO: extract multiple studies
+        muts = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
+            molecularProfileId=f"{study_id}_mutations",
+            sampleListId=f"{study_id}_all",
+            projection="DETAILED",
+        ).result()
+    else:
+        raise ValueError(f"Study {study_id} not found in cBioPortal instance {instance}")
+
+    return muts
+
+
+def parse_iterabc2dataframe(
+    list_input: iter,
+) -> pd.DataFrame:
+    """Parse an iterable of Abstract Base Class objects into a dataframe

     Parameters
     ----------
-    input_object : requests.models.Response
-        Response object from a request
+    list_input : iter
+        Iterable of Abstract Base Class objects

     Returns
     -------
-    dict
-        Dictionary of values from the response object
+    pd.DataFrame
+        Dataframe for the input iterable of Abstract Base Class objects
     """
-    dict_output = {}
-
-    list_dir = dir(input_object[0])
+    list_dir = [dir(entry) for entry in list_input]
+    set_dir = {item for sublist in list_dir for item in sublist}

-    for attr in list_dir:
-        list_attr = []
-        for entry in input_object:
+    dict_dir = {attr: [] for attr in set_dir}
+    for entry in list_input:
+        for attr in dict_dir.keys():
             try:
-                add = int(entry[attr])
-            except ValueError:
-                add = str(entry[attr])
-            list_attr.append(add)
-        dict_output[attr] = list_attr
-
-    return dict_output
+                dict_dir[attr].append(getattr(entry, attr))
+            except AttributeError:
+                dict_dir[attr].append(None)
+
+    df = pd.DataFrame.from_dict(dict_dir)
+
+    return df

-def parse_series2dict(
-    series: pd.Series,
-    strwrap: None | str = None,
-    delim1: None | str = None,
-    delim2: None | str = None,
-) -> dict:
-    """Parse a series into a dictionary
+def save_cbioportal_data_to_csv(
+    df: pd.DataFrame,
+) -> None:
+    """Save cBioPortal data to a CSV file

     Parameters
     ----------
-    series : pd.Series
-        Series to parse
-    strwrap : None | str
-        Regular expression to wrap the values in the series
-    delim1 : None | str
-        Delimiter to split the values in the series
-    delim2 : None | str
-        Delimiter to split the values in the series
+    df : pd.DataFrame
+        Dataframe of cBioPortal data

     Returns
     -------
-    dict
-        Dictionary of values from the series
+    None
     """
-    if strwrap is None:
-        strwrap = r"Gene\((.*)\)"
-    if delim1 is None:
-        delim1 = ", "
-    if delim2 is None:
-        delim2 = "="
-
-    list_temp = series.apply(
-        lambda x: re.search(strwrap, str(x)).group(1).split(delim1)
-    )
-    list_keys = [gene.split(delim2)[0] for gene in list_temp[0]]
-    dict_out = {key: [] for key in list_keys}
-
-    for row in list_temp:
-        list_row = [col.split(delim2)[1] for col in row]
-        for idx, col in enumerate(list_row):
-            dict_out[list_keys[idx]].append(col)
-
-    return dict_out
-
-
-def calc_vaf(
-    dataframe,
-    alt: None | str = None,
-    ref: None | str = None,
-):
-    if alt is None:
-        alt = "tumorAltCount"
-    if ref is None:
-        ref = "tumorRefCount"
-
-    vaf = dataframe[alt] / (dataframe[alt] + dataframe[ref])
-
-    return vaf
+    try:
+        path_data = os.environ[DATA_CACHE_DIR]
+        if not os.path.exists(path_data):
+            os.makedirs(path_data)
+        study_id = maybe_get_cbioportal_cohort_from_env()
+        df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
+    except KeyError:
+        print("DATA_CACHE not found in environment variables...")
+
+
+def main():
+    muts = get_all_mutations_by_study()
+    df_muts = parse_iterabc2dataframe(muts)
+    df_genes = parse_iterabc2dataframe(df_muts["gene"])
+    df_combo = pd.concat([df_muts, df_genes], axis=1)
+    df_combo = df_combo.drop(["gene"], axis=1)
+    save_cbioportal_data_to_csv(df_combo)
+
+
+if __name__ == "__main__":
+    main()
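Taken together, the new module is driven entirely by environment variables. A minimal usage sketch; the public instance URL and the "msk_impact_2017" study id are assumptions lifted from the code comments, not values this PR pins down:

import os

os.environ["CBIOPORTAL_INSTANCE"] = "www.cbioportal.org"  # hypothetical public instance
os.environ["CBIOPORTAL_COHORT"] = "msk_impact_2017"       # study id named in the code comments
os.environ["DATA_CACHE"] = "data_cache"                   # matches the new .gitignore entry

from missense_kinase_toolkit import cbioportal

cbioportal.main()  # should write data_cache/msk_impact_2017_mutations.csv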
7 changes: 7 additions & 0 deletions src/missense_kinase_toolkit/data_models.py
@@ -0,0 +1,7 @@
+# TODO: Define Pydantic data models for the following data sources:
+# cBioPortal mutations
+# cBioPortal clinical annotations
+# Pfam annotations
+# UniProt annotations (canonical sequence)
+# Kinase lists
+# KLIFS annotations
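The fields referenced in this PR (gene in main(), tumorAltCount/tumorRefCount in the removed calc_vaf) hint at the shape of the first model. A minimal sketch of that first TODO item, assuming pydantic; the field names are illustrative guesses at the cBioPortal mutation payload, not part of this diff:

from __future__ import annotations

from pydantic import BaseModel


class CBioPortalMutation(BaseModel):
    # Illustrative fields only; align with the actual API schema before use
    sampleId: str
    proteinChange: str | None = None
    mutationType: str | None = None
    tumorAltCount: int | None = None
    tumorRefCount: int | None = None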
33 changes: 33 additions & 0 deletions src/missense_kinase_toolkit/io_utils.py
@@ -0,0 +1,33 @@
+import os
+
+import pandas as pd
+
+DATA_CACHE_DIR = "DATA_CACHE"
+
+
+def save_dataframe_to_csv(
+    df: pd.DataFrame,
+    filename: str,
+) -> None:
+    """Save a dataframe to a CSV file
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe to save
+    filename : str
+        Name of the CSV file to save in the data cache directory
+
+    Returns
+    -------
+    None
+    """
+    # Normalize the filename so it carries exactly one ".csv" suffix
+    filename = filename.replace(".csv", "") + ".csv"
+
+    try:
+        path_data = os.environ[DATA_CACHE_DIR]
+        if not os.path.exists(path_data):
+            os.makedirs(path_data)
+        df.to_csv(os.path.join(path_data, filename), index=False)
+    except KeyError:
+        print("DATA_CACHE not found in environment variables...")