Nextflow (#14)
* reformatted cbioportal pipeline for nextflow compatibility

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor in progress changes in process_cbioportal.nf

* added shebang to process_cbioportal.py

* updated cbioportal scripts to conform to NF ETL pipeline

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* interim commit for transform_cbioportal
resolved conflicts in cbioportal.py

* added transform_cbioportal CLI code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* .gitignore and pyproject.toml changes for transform_cbioportal CLI changes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated NF README

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated cli and pfam scripts

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
jessicaw9910 and pre-commit-ci[bot] authored Apr 5, 2024
1 parent 000fab3 commit a300939
Showing 6 changed files with 85 additions and 172 deletions.
14 changes: 1 addition & 13 deletions src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -36,14 +36,8 @@ def parsearg_utils():
         help="Optional: cBioPortal API token (str)",
     )
 
-    # parser.add_argument(
-    #     "--requestsCache",
-    #     type=str,
-    #     default="",
-    #     help="Optional: Requests cache (str)",
-    # )
-
     # TODO: add logging functionality
+    # TODO: cache requests for cBioPortal API
     return parser

@@ -66,11 +60,5 @@ def main():
     except AttributeError:
         pass
 
-    # try:
-    #     if args.requestsCache != "":
-    #         config.set_request_cache(args.requestsCache)
-    # except AttributeError:
-    #     pass
-
     for study in list_studies:
         cbioportal.get_and_save_cbioportal_cohort(study)
15 changes: 6 additions & 9 deletions src/missense_kinase_toolkit/cli/transform_cbioportal.py
100644 → 100755 (file mode change: made executable)
@@ -14,7 +14,7 @@ def parsearg_utils():
         "--mutations",
         type=str,
         help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
-        default="",
+        default="Missense_Mutation",
     )
 
     parser.add_argument(

@@ -25,9 +25,9 @@
 
     parser.add_argument(
         "--requestsCache",
-        type=str,
-        default="",
-        help="Optional: Requests cache (str)",
+        type=bool,
+        default=False,
+        help="Optional: Requests cache; default False (bool)",
     )
 
     # TODO: add logging functionality

@@ -44,11 +44,8 @@ def main():
     # required argument
     config.set_output_dir(args.outDir)
 
-    try:
-        if args.requestsCache != "":
-            config.set_request_cache(args.requestsCache)
-    except AttributeError:
-        pass
+    # optional argument
+    config.set_request_cache(args.requestsCache)
 
     df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
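A side note on the new `--requestsCache` flag (an observation about argparse, not part of the commit): `type=bool` applies `bool()` to the raw command-line string, and any non-empty string is truthy, so an explicit `--requestsCache False` still parses as `True`. A minimal sketch of the pitfall:

```python
# Illustrative only; mirrors the flag added in the diff above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--requestsCache", type=bool, default=False)

print(parser.parse_args([]).requestsCache)                            # False (default)
print(parser.parse_args(["--requestsCache", "False"]).requestsCache)  # True, because bool("False") is True
```

Common alternatives are `action="store_true"` or a custom string-to-bool converter.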
5 changes: 3 additions & 2 deletions src/missense_kinase_toolkit/config.py
@@ -107,7 +107,7 @@ def maybe_get_cbioportal_token(
 
 
 def set_request_cache(
-    val: str
+    val: bool
 ) -> None:
     """Set the request cache path in environment variables

@@ -120,7 +120,8 @@
     -------
     None
     """
-    os.environ[REQUEST_CACHE_VAR] = val
+    #TODO: val should be bool but doesn't work with env, fix
+    os.environ[REQUEST_CACHE_VAR] = str(val)
 
 
 def maybe_get_request_cache(
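The `#TODO` above reflects that `os.environ` only stores strings, which is why the diff stringifies the boolean. A minimal sketch of one way the round trip could work, assuming `REQUEST_CACHE_VAR` names the environment variable (the repository's actual getter may differ):

```python
# Illustrative round trip of a bool through the environment.
import os

REQUEST_CACHE_VAR = "REQUESTS_CACHE"  # assumed value of the config constant

def set_request_cache(val: bool) -> None:
    os.environ[REQUEST_CACHE_VAR] = str(val)  # stores "True" or "False"

def get_request_cache() -> bool:
    # env values are strings, so compare against the stringified form
    return os.environ.get(REQUEST_CACHE_VAR, "False") == "True"

set_request_cache(True)
assert get_request_cache() is True
```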
4 changes: 3 additions & 1 deletion src/missense_kinase_toolkit/io_utils.py
@@ -48,6 +48,7 @@ def save_dataframe_to_csv(
 
 def concatenate_csv_files_with_glob(
     str_find: str,
+    str_remove: str = "transformed_mutations.csv",
 ) -> pd.DataFrame:
     """Use glob to find csv files to concatenate

@@ -66,11 +67,12 @@
     str_find = str_find.replace(".csv", "") + ".csv"
     path_data = check_outdir_exists()
     csv_files = glob.glob(os.path.join(path_data, str_find))
+    csv_files = [csv_file for csv_file in csv_files if str_remove not in csv_file]
 
     df_combo = pd.DataFrame()
     if len(csv_files) > 0:
         for csv_file in csv_files:
-            df = pd.read_csv(csv_file)
+            df = pd.read_csv(csv_file, low_memory=False)
             df_combo = pd.concat([df_combo, df])
     else:
         print(f"No files matching {str_find} found in {path_data}...")
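The new `str_remove` parameter keeps the function from re-ingesting its own output when the concatenated, transformed file is written to the same directory as the per-study extracts. Hypothetical usage, mirroring the call in `transform_cbioportal.py`:

```python
# Assumes the configured output dir holds files like study1_mutations.csv;
# any filename containing "transformed_mutations.csv" (the default
# str_remove) is excluded before concatenation.
from missense_kinase_toolkit import io_utils

df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
print(df_cbioportal.shape)
```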
205 changes: 65 additions & 140 deletions src/missense_kinase_toolkit/pfam.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-from missense_kinase_toolkit import requests_wrapper
+from missense_kinase_toolkit import requests_wrapper, utils_requests
 
 
 def retrieve_pfam(

@@ -21,7 +21,6 @@ def retrieve_pfam(
         DataFrame with Pfam domain information if request is successful, UniProt ID if request fails;
         None if response is empty
     """
-
     url = f"https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/UniProt/{uniprot_id}"
 
     header = {"Accept": "application/json"}

@@ -31,146 +30,72 @@
     )
 
     if res.ok:
-        dict_json = json.loads(res.text)["results"]
-        try:
-            df1_out = pd.DataFrame()
-            df2_out = pd.DataFrame()
-
-            for entry in dict_json:
-                df1_temp = pd.DataFrame.from_dict(
-                    entry["metadata"], orient="index"
-                ).transpose()
-                df1_out = pd.concat([df1_out, df1_temp]).reset_index(drop=True)
-
-                df2_temp = pd.DataFrame.from_dict(
-                    entry["proteins"][0], orient="index"
-                ).transpose()
-                df2_out = pd.concat([df2_out, df2_temp]).reset_index(drop=True)
-
-            df1_out = df1_out.rename(columns={"accession": "pfam_accession"})
-            df2_out = df2_out.rename(
-                columns={
-                    "accession": "uniprot_accession",
-                    "source_database": "review_status",
-                }
-            )
-
-            df_out = pd.concat([df1_out, df2_out], axis=1)
-            df_out = df_out.explode("entry_protein_locations").reset_index(drop=True)
-
-            list_entry = ["model", "score"]
-            for entry in list_entry:
-                df_out[entry] = df_out["entry_protein_locations"].apply(
-                    lambda x: x[entry]
-                )
-
-            list_fragments = ["start", "end", "dc-status"]
-            for entry in list_fragments:
-                df_out[entry] = df_out["entry_protein_locations"].apply(
-                    lambda x: x["fragments"][0][entry]
-                )
-
-            del df_out["entry_protein_locations"]
-
-            return df_out
-        except KeyError:
-            print("Error:")
-            print(dict_json)
-            print()
-    if len(res.text) == 0:
-        print(f"No PFAM domains found: {uniprot_id}")
-        return None
-    else:
-        return uniprot_id
-
-
-def concat_pfam(
-    iter_uniprot: iter[str],
-    iter_hgnc: iter[str],
-) -> tuple[pd.DataFrame, dict[str, str], dict[str, str]]:
-    """Concatenate Pfam domain information for a list of UniProt IDs
-    Parameters
-    ----------
-    iter_uniprot : iter[str]
-        Iterable of UniProt IDs
-    iter_hgnc : iter[str]
-        Iterable of HGNC symbols
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with Pfam domain information
-    dict[str, str]
-        Dictionary of HGNC symbols and UniProt IDs with errors
-    dict[str, str]
-        Dictionary of HGNC symbols and UniProt IDs with missing information
-    """
-    dict_error = {}
-    dict_missing = {}
-    df = pd.DataFrame()
-
-    for uniprot, hgnc in zip(iter_uniprot, iter_hgnc):
-        temp = retrieve_pfam(uniprot)
-
-        if temp is None:
-            dict_error[hgnc] = uniprot
-        if type(temp) is str:
-            dict_missing[hgnc] = uniprot
-        else:
-            temp.insert(0, "hgnc", hgnc)
-            df = pd.concat([df, temp]).reset_index(drop=True)
-
-    return df, dict_error, dict_missing
-
-
-def extract_numeric(
-    input_string: str,
-) -> str:
-    """Extract numeric characters from a string
-    Parameters
-    ----------
-    input_string : str
-        Input string
-    Returns
-    -------
-    str
-        Numeric characters extracted from the input string
-    """
-    num = ""
-    for i in input_string:
-        if i.isdigit():
-            num = num + i
-    return num
-
+        list_json = json.loads(res.text)["results"]
+
+        # metadata for UniProt ID
+        list_metadata = [entry["metadata"] for entry in list_json]
+        list_metadata = [{"pfam_accession" if k == "accession" else k:v for k,v in entry.items()} for entry in list_metadata]
+
+        # Pfam domains locations
+        list_locations = [entry["proteins"][0]["entry_protein_locations"][0]["fragments"][0] for entry in list_json]
+
+        # model information
+        list_model = [entry["proteins"][0]["entry_protein_locations"][0] for entry in list_json]
+        [entry.pop("fragments", None) for entry in list_model]
+
+        # protein information
+        # do last because pop is an in-place operation
+        list_protein = [entry["proteins"][0] for entry in list_json]
+        [entry.pop("entry_protein_locations", None) for entry in list_protein]
+        list_protein = [{"uniprot" if k == "accession" else k:v for k,v in entry.items()} for entry in list_protein]
+
+        df_concat = pd.concat(
+            [
+                pd.DataFrame(list_protein),
+                pd.DataFrame(list_metadata),
+                pd.DataFrame(list_locations),
+                pd.DataFrame(list_model)
+            ],
+            axis=1
+        )
+
-def find_pfam(
-    input_hgnc: str,
-    input_position: int,
-    df_ref: pd.DataFrame,
-) -> str | None:
-    """Find Pfam domain for a given HGNC symbol and position
+        return df_concat
+    else:
+        utils_requests.print_status_code_if_res_not_ok(res)
+        return None
-
-    Parameters
-    ----------
-    input_hgnc : str
-        HGNC symbol
-    input_position : int
-        Codon position
-    df_ref : pd.DataFrame
-        DataFrame with Pfam domain information
-
-    Returns
-    -------
-    str | None
-        Pfam domain if found, None if not found
-    """
-    df_temp = df_ref.loc[df_ref["hgnc"] == input_hgnc].reset_index()
-    try:
-        domain = df_temp.loc[
-            ((input_position >= df_temp["start"]) & (input_position <= df_temp["end"])),
-            "name",
-        ].values[0]
-        return domain
-    except IndexError:
-        return None
+# def find_pfam(
+#     input_hgnc: str,
+#     input_position: int,
+#     df_ref: pd.DataFrame,
+# ) -> str | None:
+#     """Find Pfam domain for a given HGNC symbol and position
+
+#     Parameters
+#     ----------
+#     input_hgnc : str
+#         HGNC symbol
+#     input_position : int
+#         Codon position
+#     df_ref : pd.DataFrame
+#         DataFrame with Pfam domain information
+
+#     Returns
+#     -------
+#     str | None
+#         Pfam domain if found, None if not found
+#     """
+#     df_temp = df_ref.loc[df_ref["hgnc"] == input_hgnc].reset_index()
+#     try:
+#         domain = df_temp.loc[
+#             ((input_position >= df_temp["start"]) & (input_position <= df_temp["end"])),
+#             "name",
+#         ].values[0]
+#         return domain
+#     except IndexError:
+#         return None
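To see what the rewritten parsing in `retrieve_pfam` produces, here is a toy run of the same flattening steps on a single fabricated entry shaped like the InterPro response (all values invented for illustration):

```python
import pandas as pd

# one mock entry from the InterPro "results" list
list_json = [
    {
        "metadata": {"accession": "PF07714", "name": "PK_Tyr_Ser-Thr"},
        "proteins": [
            {
                "accession": "p00533",
                "source_database": "reviewed",
                "entry_protein_locations": [
                    {
                        "fragments": [{"start": 712, "end": 968, "dc-status": "CONTINUOUS"}],
                        "model": "PF07714",
                        "score": 1.1e-45,
                    }
                ],
            }
        ],
    }
]

# same steps as the new code: rename the metadata accession, pull fragment
# coordinates, strip fragments from the model dict, then strip locations
# from the protein dict (pop is in-place, so order matters)
list_metadata = [{"pfam_accession" if k == "accession" else k: v for k, v in e["metadata"].items()} for e in list_json]
list_locations = [e["proteins"][0]["entry_protein_locations"][0]["fragments"][0] for e in list_json]
list_model = [e["proteins"][0]["entry_protein_locations"][0] for e in list_json]
[e.pop("fragments", None) for e in list_model]
list_protein = [e["proteins"][0] for e in list_json]
[e.pop("entry_protein_locations", None) for e in list_protein]
list_protein = [{"uniprot" if k == "accession" else k: v for k, v in e.items()} for e in list_protein]

df_concat = pd.concat(
    [pd.DataFrame(list_protein), pd.DataFrame(list_metadata), pd.DataFrame(list_locations), pd.DataFrame(list_model)],
    axis=1,
)
print(df_concat.columns.tolist())
# ['uniprot', 'source_database', 'pfam_accession', 'name', 'start', 'end', 'dc-status', 'model', 'score']
```

Note that the new code keeps only the first location and first fragment per entry; domains split across multiple locations would need the exploded handling the old version used.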
14 changes: 7 additions & 7 deletions src/nextflow/README.MD
@@ -15,10 +15,10 @@ Generate own `params.json` file using the following parameters:
 
 Below is a description of what each variable should contain. If variable is optional and not in use, do not create any entry in the `json` file.
 
-| Variable | Optional | Description |
-| :--------------------| :------: | :---------- |
-| 'CBIOPORTAL_COHORT' | No | cBioPortal cohort to analyze |
-| 'OUTPUT_DIR' | No | Path to outdir to save data |
-| 'CBIOPORTAL_INSTANCE'| Yes | `cbioportal.org` if none provided |
-| 'CBIOPORTAL_TOKEN' | Yes | Data Access Token if using private instance|
-| 'REQUESTS_CACHE' | Yes | Path to dir to cache requests data |
+| Variable | Optional | Description |
+| :--------------------| :------: | :----------------------------------------- |
+| CBIOPORTAL_COHORT | No | cBioPortal cohort to analyze |
+| OUTPUT_DIR | No | Path to outdir to save data |
+| CBIOPORTAL_INSTANCE | Yes | `cbioportal.org` if none provided |
+| CBIOPORTAL_TOKEN | Yes | Data Access Token if using private instance|
+| REQUESTS_CACHE | Yes | Boolean of whether to cache requests data |

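Putting the table together, a hypothetical `params.json` might look like the following (cohort name and path are placeholders; `CBIOPORTAL_TOKEN` is omitted since this example targets the public instance):

```json
{
    "CBIOPORTAL_COHORT": "msk_impact_2017",
    "OUTPUT_DIR": "/path/to/output",
    "CBIOPORTAL_INSTANCE": "cbioportal.org",
    "REQUESTS_CACHE": true
}
```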