Cbioportal #11

Merged · 11 commits · Apr 3, 2024
Changes from all commits
4 changes: 3 additions & 1 deletion .github/workflows/CI.yaml
@@ -35,10 +35,12 @@ jobs:
         ulimit -a

       # More info on options: https://github.com/marketplace/actions/provision-with-micromamba
-      - uses: mamba-org/provision-with-micromamba@main
+      # https://github.com/mamba-org/provision-with-micromamba#migration-to-setup-micromamba
+      - uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: devtools/conda-envs/test_env.yaml
          environment-name: test
+          # conda-forge is the default channel now and does not need to be specified
          channels: conda-forge,defaults
          extra-specs: |
            python=${{ matrix.python-version }}
4 changes: 4 additions & 0 deletions .gitignore
@@ -183,3 +183,7 @@ $RECYCLE.BIN/
 *.lnk

 # End of https://www.toptal.com/developers/gitignore/api/osx,windows,linux
+
+# Requests cache directory
+requests_cache/
+data_cache/
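These two ignored directories line up with the caching code elsewhere in the PR: requests-cache for HTTP responses and the DATA_CACHE CSV dump. A sketch of how requests_cache/ could appear on disk, assuming requests-cache's filesystem backend (the backend choice is an assumption; this diff only ignores the directory):

import requests_cache

# With the filesystem backend, the cache name is a directory path, so cached
# responses land under requests_cache/, which is why the directory is ignored.
session = requests_cache.CachedSession("requests_cache/http", backend="filesystem")
resp = session.get("https://www.cbioportal.org/api/cancer-types")
print(resp.from_cache)  # True on a repeated call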
1,232 changes: 1,180 additions & 52 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pyproject.toml
@@ -1,6 +1,7 @@
 [tool.poetry]
 name = "missense-kinase-toolkit"
 # https://github.com/mtkennerly/poetry-dynamic-versioning/issues/14
+# https://browniebroke.com/blog/convert-existing-poetry-to-src-layout/
 version = "0.0.0"
 description = "An ETL pipeline package to facilitate structure-based ML for human kinase property prediction"
 authors = ["Jess White <[email protected]>"]
@@ -23,6 +24,9 @@ tqdm = "4.64.0"
 pandas = ">=2,<3"
 requests = ">=2.28.1,<3"
 requests-cache = ">=0.9.7,<1"
+bravado = "^11.0.3"
+janitor = "^0.1.1"
+beautifulsoup4 = "^4.12.3"
Empty file added src/__init__.py
Empty file.
266 changes: 161 additions & 105 deletions src/missense_kinase_toolkit/cbioportal.py
@@ -1,145 +1,201 @@
-from __future__ import annotations
+#!/usr/bin/env python3

-import re
+from __future__ import annotations

-import requests
+import os
 import pandas as pd
+import sys

+from bravado.client import SwaggerClient
+from bravado.requests_client import RequestsClient
+
+CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
+CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
+DATA_CACHE_DIR = "DATA_CACHE"
+CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"

-def create_setlist(
-    input_object: requests.models.Response,
-    attr: str,
-) -> tuple[list, set]:
-    """Create a list and set of unique values from a response object
-
-    Parameters
-    ----------
-    input_object : requests.models.Response
-        Response object from a request
-    attr : str
-        Attribute to extract from the response object
+
+def maybe_get_cbioportal_token_from_env() -> str | None:
+    """Get the cBioPortal token from the environment

     Returns
     -------
-    tuple[list, set]
-        List and set of unique values from the response object
+    str | None
+        cBioPortal token as string if it exists, otherwise None
     """
-    list_output = []
-    set_output = set()
-
-    for entry in input_object:
-        list_output.append(entry[attr])
-        set_output.add(entry[attr])
-
-    return list_output, set_output
+    try:
+        token = os.environ[CBIOPORTAL_TOKEN_VAR]
+    except KeyError:
+        token = None
+
+    return token

-def print_counts(
-    list_input: list,
-) -> None:
-    """Print the counts of unique values in a list
-
-    Parameters
-    ----------
-    list_input : list
-        List of values to count
+def maybe_get_cbioportal_instance_from_env() -> str | None:
+    """Get the cBioPortal instance from the environment

     Returns
     -------
-    None
+    str | None
+        cBioPortal instance as string if it exists, otherwise None
     """
-    for Unique in set(list_input):
-        n = list_input.count(Unique)
-        print(f"{Unique:<15} \t {n:>10}")
+    try:
+        instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
+    except KeyError:
+        instance = None
+
+    return instance
+
+
+def maybe_get_cbioportal_cohort_from_env() -> str | None:
+    """Get the cBioPortal cohort from the environment
+
+    Returns
+    -------
+    str | None
+        cBioPortal cohort as string if it exists; exits if not set
+    """
+    try:
+        cohort = os.environ[CBIOPORTAL_COHORT_VAR]
+    except KeyError:
+        print("Cohort not found in environment variables. This is necessary to run analysis. Exiting...")
+        sys.exit(1)
+
+    return cohort

-def parse_obj2dict(
-    input_object: requests.models.Response,
-) -> dict:
-    """Parse a response object into a dictionary
+def get_all_mutations_by_study() -> list | None:
+    """Get mutation data for a study from cBioPortal
+
+    Returns
+    -------
+    list | None
+        List of Abstract Base Class objects if successful, otherwise None
+    """
+    token = maybe_get_cbioportal_token_from_env()
+
+    instance = maybe_get_cbioportal_instance_from_env()
+    if instance is not None:
+        url = f"https://{instance}/api/v2/api-docs"
+    else:
+        url = "https://cbioportal.org/api/v2/api-docs"
+
+    # Zehir, 2017 MSKCC sequencing cohort is "msk_impact_2017"
+    # MSKCC clinical sequencing cohort is "mskimpact"
+    study_id = maybe_get_cbioportal_cohort_from_env()
+
+    if all(v is not None for v in (token, instance)):
+        http_client = RequestsClient()
+        http_client.set_api_key(
+            instance,
+            f"Bearer {token}",
+            param_name="Authorization",
+            param_in="header",
+        )
+        cbioportal = SwaggerClient.from_url(
+            url,
+            http_client=http_client,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False,
+            },
+        )
+    else:
+        cbioportal = SwaggerClient.from_url(
+            url,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False,
+            },
+        )
+
+    studies = cbioportal.Studies.getAllStudiesUsingGET().result()
+    study_ids = [study.studyId for study in studies]
+
+    if study_id in study_ids:
+        # TODO: add error handling
+        # TODO: extract multiple studies
+        muts = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
+            molecularProfileId=f"{study_id}_mutations",
+            sampleListId=f"{study_id}_all",
+            projection="DETAILED",
+        ).result()
+    else:
+        raise ValueError(f"Study {study_id} not found in cBioPortal instance {instance}")
+
+    return muts
+
+
+def parse_iterabc2dataframe(
+    list_input: iter,
+) -> pd.DataFrame:
+    """Parse an iterable of Abstract Base Class objects into a dataframe

     Parameters
     ----------
-    input_object : requests.models.Response
-        Response object from a request
+    list_input : iter
+        Iterable of Abstract Base Class objects

     Returns
     -------
-    dict
-        Dictionary of values from the response object
+    pd.DataFrame
+        Dataframe for the input iterable of Abstract Base Class objects
     """
-    dict_output = {}
-
-    list_dir = dir(input_object[0])
+    list_dir = [dir(entry) for entry in list_input]
+    set_dir = {item for sublist in list_dir for item in sublist}

-    for attr in list_dir:
-        list_attr = []
-        for entry in input_object:
+    dict_dir = {attr: [] for attr in set_dir}
+    for entry in list_input:
+        for attr in dict_dir.keys():
             try:
-                add = int(entry[attr])
-            except ValueError:
-                add = str(entry[attr])
-            list_attr.append(add)
-        dict_output[attr] = list_attr
-
-    return dict_output
+                dict_dir[attr].append(getattr(entry, attr))
+            except AttributeError:
+                dict_dir[attr].append(None)
+
+    df = pd.DataFrame.from_dict(dict_dir)
+
+    return df

-def parse_series2dict(
-    series: pd.Series,
-    strwrap: None | str = None,
-    delim1: None | str = None,
-    delim2: None | str = None,
-) -> dict:
-    """Parse a series into a dictionary
+def save_cbioportal_data_to_csv(
+    df: pd.DataFrame,
+) -> None:
+    """Save cBioPortal data to a CSV file

     Parameters
     ----------
-    series : pd.Series
-        Series to parse
-    strwrap : None | str
-        Regular expression to wrap the values in the series
-    delim1 : None | str
-        Delimiter to split the values in the series
-    delim2 : None | str
-        Delimiter to split the values in the series
+    df : pd.DataFrame
+        Dataframe of cBioPortal data

     Returns
     -------
-    dict
-        Dictionary of values from the series
+    None
     """
-    if strwrap is None:
-        strwrap = r"Gene\((.*)\)"
-    if delim1 is None:
-        delim1 = ", "
-    if delim2 is None:
-        delim2 = "="
-
-    list_temp = series.apply(
-        lambda x: re.search(strwrap, str(x)).group(1).split(delim1)
-    )
-    list_keys = [gene.split(delim2)[0] for gene in list_temp[0]]
-    dict_out = {key: [] for key in list_keys}
-
-    for row in list_temp:
-        list_row = [col.split(delim2)[1] for col in row]
-        for idx, col in enumerate(list_row):
-            dict_out[list_keys[idx]].append(col)
-
-    return dict_out
-
-
-def calc_vaf(
-    dataframe,
-    alt: None | str = None,
-    ref: None | str = None,
-):
-    if alt is None:
-        alt = "tumorAltCount"
-    if ref is None:
-        ref = "tumorRefCount"
-
-    vaf = dataframe[alt] / (dataframe[alt] + dataframe[ref])
-
-    return vaf
+    try:
+        path_data = os.environ[DATA_CACHE_DIR]
+        if not os.path.exists(path_data):
+            os.makedirs(path_data)
+        study_id = maybe_get_cbioportal_cohort_from_env()
+        df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
+    except KeyError:
+        print("DATA_CACHE not found in environment variables...")
+
+
+def main():
+    muts = get_all_mutations_by_study()
+    df_muts = parse_iterabc2dataframe(muts)
+    df_genes = parse_iterabc2dataframe(df_muts["gene"])
+    df_combo = pd.concat([df_muts, df_genes], axis=1)
+    df_combo = df_combo.drop(["gene"], axis=1)
+    save_cbioportal_data_to_csv(df_combo)
+
+
+if __name__ == "__main__":
+    main()
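Taken together, the new module is driven entirely by environment variables. A minimal usage sketch; the public instance URL and the "msk_impact_2017" study id are assumptions lifted from the code comments, not values this PR pins down:

import os

os.environ["CBIOPORTAL_INSTANCE"] = "www.cbioportal.org"  # hypothetical public instance
os.environ["CBIOPORTAL_COHORT"] = "msk_impact_2017"       # study id named in the code comments
os.environ["DATA_CACHE"] = "data_cache"                   # matches the new .gitignore entry

from missense_kinase_toolkit import cbioportal

cbioportal.main()  # should write data_cache/msk_impact_2017_mutations.csv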
7 changes: 7 additions & 0 deletions src/missense_kinase_toolkit/data_models.py
@@ -0,0 +1,7 @@
+# TODO: Define Pydantic data models for the following data sources:
+# cBioPortal mutations
+# cBioPortal clinical annotations
+# Pfam annotations
+# UniProt annotations (canonical sequence)
+# Kinase lists
+# KLIFS annotations
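The fields referenced in this PR (gene in main(), tumorAltCount/tumorRefCount in the removed calc_vaf) hint at the shape of the first model. A minimal sketch of that first TODO item, assuming pydantic; the field names are illustrative guesses at the cBioPortal mutation payload, not part of this diff:

from __future__ import annotations

from pydantic import BaseModel


class CBioPortalMutation(BaseModel):
    # Illustrative fields only; align with the actual API schema before use
    sampleId: str
    proteinChange: str | None = None
    mutationType: str | None = None
    tumorAltCount: int | None = None
    tumorRefCount: int | None = None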
33 changes: 33 additions & 0 deletions src/missense_kinase_toolkit/io_utils.py
@@ -0,0 +1,33 @@
+import os
+
+import pandas as pd
+
+DATA_CACHE_DIR = "DATA_CACHE"
+
+
+def save_dataframe_to_csv(
+    df: pd.DataFrame,
+    filename: str,
+) -> None:
+    """Save a dataframe to a CSV file
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe to save
+    filename : str
+        Name of the CSV file to save in the data cache directory
+
+    Returns
+    -------
+    None
+    """
+    # Normalize the filename so it carries exactly one ".csv" suffix
+    filename = filename.replace(".csv", "") + ".csv"
+
+    try:
+        path_data = os.environ[DATA_CACHE_DIR]
+        if not os.path.exists(path_data):
+            os.makedirs(path_data)
+        df.to_csv(os.path.join(path_data, filename), index=False)
+    except KeyError:
+        print("DATA_CACHE not found in environment variables...")