diff --git a/.gitignore b/.gitignore
index bc4bf39..d667037 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,5 +185,7 @@ $RECYCLE.BIN/
 # End of https://www.toptal.com/developers/gitignore/api/osx,windows,linux
 
 # Requests cache directory
-requests_cache/
-data_cache/
+# requests_cache/
+# data_cache/
+*params.json
+True.sqlite
diff --git a/pyproject.toml b/pyproject.toml
index 925a25b..a607aed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,10 @@ pytest-runner = "^6.0.1"
 [tool.pytest.ini_options]
 pythonpath = ["src"]
 
+[tool.poetry.scripts]
+extract_cbioportal = "missense_kinase_toolkit.cli.extract_cbioportal:main"
+transform_cbioportal = "missense_kinase_toolkit.cli.transform_cbioportal:main"
+
 [tool.poetry-dynamic-versioning]
 enable = true
 vcs = "git"
diff --git a/src/missense_kinase_toolkit/cbioportal.py b/src/missense_kinase_toolkit/cbioportal.py
index 6ff52c5..67403ef 100644
--- a/src/missense_kinase_toolkit/cbioportal.py
+++ b/src/missense_kinase_toolkit/cbioportal.py
@@ -1,74 +1,16 @@
 #!/usr/bin/env python3
 
-from __future__ import annotations
-
 import os
 import pandas as pd
-import sys
 
 from bravado.client import SwaggerClient
 from bravado.requests_client import RequestsClient
-
-CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
-CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
-DATA_CACHE_DIR = "DATA_CACHE"
-CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"
-
-
-def maybe_get_cbioportal_token_from_env(
-) -> str | None:
-    """Get the cBioPortal token from the environment
-
-    Returns
-    -------
-    str | None
-        cBioPortal token as string if exists, otherwise None
-    """
-    try:
-        token = os.environ[CBIOPORTAL_TOKEN_VAR]
-    except KeyError:
-        token = None
-
-    return token
-
-
-def maybe_get_cbioportal_instance_from_env(
-) -> str | None:
-    """Get the cBioPortal instance from the environment
-
-    Returns
-    -------
-    str | None
-        cBioPortal instance as string if exists, otherwise None
-    """
-    try:
-        instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
-    except KeyError:
-        instance = None
-
-    return instance
-
-
-def maybe_get_cbioportal_cohort_from_env(
-) -> str | None:
-    """Get the cBioPortal instance from the environment
-
-    Returns
-    -------
-    str | None
-        cBioPortal instance as string if exists, otherwise None
-    """
-    try:
-        instance = os.environ[CBIOPORTAL_COHORT_VAR]
-    except KeyError:
-        print("Cohort not found in environment variables. This is necessary to run analysis. Exiting...")
-        sys.exit(1)
-
-    return instance
+from missense_kinase_toolkit import config, io_utils
 
 
 def get_all_mutations_by_study(
+    study_id: str,
 ) -> list | None:
     """Get mutations cBioPortal data
 
@@ -77,19 +19,11 @@
     list | None
         cBioPortal data of Abstract Base Classes objects if successful, otherwise None
     """
-    token = maybe_get_cbioportal_token_from_env()
-
-    instance = maybe_get_cbioportal_instance_from_env()
-    if instance is not None:
-        url = f"https://{instance}/api/v2/api-docs"
-    else:
-        url = "https://cbioportal.org/api/v2/api-docs"
-
-    # Zehir, 2017 MSKCC sequencing cohort is "msk_impact_2017"
-    # MSKCC clinical sequencing cohort is "mskimpact"
-    study_id = maybe_get_cbioportal_cohort_from_env()
+    instance = config.get_cbioportal_instance()
+    url = f"https://{instance}/api/v2/api-docs"
+    token = config.maybe_get_cbioportal_token()
 
-    if all(v is not None for v in (token, instance)):
+    if token is not None:
         http_client = RequestsClient()
         http_client.set_api_key(
             instance,
@@ -160,42 +94,21 @@
             dict_dir[attr].append(None)
 
     df = pd.DataFrame.from_dict(dict_dir)
+    df = df[sorted(df.columns.to_list())]
 
     return df
 
 
-def save_cbioportal_data_to_csv(
-    df: pd.DataFrame,
+def get_and_save_cbioportal_cohort(
+    study_id: str,
 ) -> None:
-    """Save cBioPortal data to a CSV file
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Dataframe of cBioPortal data
+    """Extract all mutations for a cBioPortal study and save them as a CSV file
+
+    Parameters
+    ----------
+    study_id : str
+        cBioPortal study identifier (e.g., "msk_impact_2017")
+    """
+    muts = get_all_mutations_by_study(study_id)
 
-    Returns
-    -------
-    None
-    """
-    try:
-        path_data = os.environ[DATA_CACHE_DIR]
-        if not os.path.exists(path_data):
-            os.makedirs(path_data)
-        study_id = maybe_get_cbioportal_cohort_from_env()
-        df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
-    except KeyError:
-        print("DATA_CACHE not found in environment variables...")
-
-
-def main():
-    muts = get_all_mutations_by_study()
     df_muts = parse_iterabc2dataframe(muts)
     df_genes = parse_iterabc2dataframe(df_muts["gene"])
 
     df_combo = pd.concat([df_muts, df_genes], axis=1)
-    df_combo = df_combo.drop(['gene'], axis=1)
+    df_combo = df_combo.drop(["gene"], axis=1)
 
-    save_cbioportal_data_to_csv(df_combo)
+    filename = f"{study_id}_mutations.csv"
 
-if __name__ == "__main__":
-    main()
+    io_utils.save_dataframe_to_csv(df_combo, filename)
diff --git a/src/missense_kinase_toolkit/cli/__init__.py b/src/missense_kinase_toolkit/cli/__init__.py
new file mode 100644
index 0000000..e69de29
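As an illustrative sketch only (not part of the patch; the output directory and cohort ID are placeholder values), the refactored extraction path can also be driven directly from Python through the new `config` setters and `cbioportal.get_and_save_cbioportal_cohort` introduced in this diff:

```python
from missense_kinase_toolkit import cbioportal, config

# placeholder values: any writable directory and any valid cBioPortal study ID
config.set_output_dir("/tmp/mkt_output")
config.set_cbioportal_instance("www.cbioportal.org")
# config.set_cbioportal_token("...")  # only needed when querying a private instance

# fetches all mutations for the study and writes msk_impact_2017_mutations.csv to OUTPUT_DIR
cbioportal.get_and_save_cbioportal_cohort("msk_impact_2017")
```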
diff --git a/src/missense_kinase_toolkit/cli/extract_cbioportal.py b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
new file mode 100755
index 0000000..31dc887
--- /dev/null
+++ b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+import argparse
+
+from missense_kinase_toolkit import config, cbioportal
+
+
+def parsearg_utils():
+    parser = argparse.ArgumentParser(
+        description="Get mutations from cBioPortal cohort and instance"
+    )
+
+    parser.add_argument(
+        "--cohort",
+        type=str,
+        help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
+        default="msk_impact_2017",
+    )
+
+    parser.add_argument(
+        "--outDir",
+        type=str,
+        help="Required: Output directory path (str)",
+    )
+
+    parser.add_argument(
+        "--instance",
+        type=str,
+        help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `www.cbioportal.org` (str)",
+        default="www.cbioportal.org",
+    )
+
+    parser.add_argument(
+        "--token",
+        type=str,
+        default="",
+        help="Optional: cBioPortal API token (str)",
+    )
+
+    # parser.add_argument(
+    #     "--requestsCache",
+    #     type=str,
+    #     default="",
+    #     help="Optional: Requests cache (str)",
+    # )
+
+    # TODO: add logging functionality
+
+    return parser
+
+
+def main():
+    args = parsearg_utils().parse_args()
+
+    str_studies = args.cohort
+    list_studies = str_studies.split(",")
+    list_studies = [study.strip() for study in list_studies]
+
+    # required argument
+    config.set_output_dir(args.outDir)
+
+    # optional arguments
+    config.set_cbioportal_instance(args.instance)
+
+    try:
+        if args.token != "":
+            config.set_cbioportal_token(args.token)
+    except AttributeError:
+        pass
+
+    # try:
+    #     if args.requestsCache != "":
+    #         config.set_request_cache(args.requestsCache)
+    # except AttributeError:
+    #     pass
+
+    for study in list_studies:
+        cbioportal.get_and_save_cbioportal_cohort(study)
diff --git a/src/missense_kinase_toolkit/cli/transform_cbioportal.py b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
new file mode 100644
index 0000000..b2e815c
--- /dev/null
+++ b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import argparse
+
+from missense_kinase_toolkit import config, scrapers, io_utils
+
+
+def parsearg_utils():
+    parser = argparse.ArgumentParser(
+        description="Concatenate, remove duplicates, and extract genes and mutation types of interest"
+    )
+
+    parser.add_argument(
+        "--mutations",
+        type=str,
+        help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
+        default="",
+    )
+
+    parser.add_argument(
+        "--outDir",
+        type=str,
+        help="Required: Output directory path (str)",
+    )
+
+    parser.add_argument(
+        "--requestsCache",
+        type=str,
+        default="",
+        help="Optional: Requests cache (str)",
+    )
+
+    # TODO: add logging functionality
+
+    return parser
+
+
+def main():
+    args = parsearg_utils().parse_args()
+
+    str_mutations = args.mutations
+    list_mutations = str_mutations.split(",")
+    list_mutations = [mutation.strip() for mutation in list_mutations]
+
+    # required argument
+    config.set_output_dir(args.outDir)
+
+    try:
+        if args.requestsCache != "":
+            config.set_request_cache(args.requestsCache)
+    except AttributeError:
+        pass
+
+    df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
+
+    df_kinhub = scrapers.kinhub()
+    io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")
+
+    list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()
+
+    df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations), ].reset_index(drop=True)
+    df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc), ].reset_index(drop=True)
+
+    list_cols = ["HGNC Name", "UniprotID"]
+    df_subset_merge = df_subset.merge(df_kinhub[list_cols],
+                                      how="left",
+                                      left_on="hugoGeneSymbol",
+                                      right_on="HGNC Name")
+    df_subset_merge = df_subset_merge.drop(["HGNC Name"], axis=1)
+
+    io_utils.save_dataframe_to_csv(df_subset_merge, "transformed_mutations.csv")
diff --git a/src/missense_kinase_toolkit/config.py b/src/missense_kinase_toolkit/config.py
new file mode 100644
index 0000000..8efa52f
--- /dev/null
+++ b/src/missense_kinase_toolkit/config.py
@@ -0,0 +1,138 @@
+import os
+import sys
+
+
+OUTPUT_DIR_VAR = "OUTPUT_DIR"
+CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
+CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
+REQUEST_CACHE_VAR = "REQUESTS_CACHE"
+
+
+def set_output_dir(
+    val: str
+) -> None:
+    """Set the output directory in environment variables
+
+    Parameters
+    ----------
+    val : str
+        Output directory path
+
+    Returns
+    -------
+    None
+    """
+    os.environ[OUTPUT_DIR_VAR] = val
+
+
+def get_output_dir(
+) -> str | None:
+    """Get the output directory from the environment
+
+    Returns
+    -------
+    str | None
+        Output directory path if exists, otherwise None
+    """
+    try:
+        return os.environ[OUTPUT_DIR_VAR]
+    except KeyError:
+        print("Output directory not found in environment variables. This is necessary to run analysis. Exiting...")
+        sys.exit(1)
+
+
+def set_cbioportal_instance(
+    val: str
+) -> None:
+    """Set the cBioPortal instance in the environment variables
+
+    Parameters
+    ----------
+    val : str
+        cBioPortal instance; e.g., "cbioportal.mskcc.org" for the private MSKCC portal or "www.cbioportal.org" for the public portal
+
+    Returns
+    -------
+    None
+    """
+    os.environ[CBIOPORTAL_INSTANCE_VAR] = val
+
+
+def get_cbioportal_instance(
+) -> str | None:
+    """Get the cBioPortal instance from the environment
+
+    Returns
+    -------
+    str | None
+        cBioPortal instance as string if exists, otherwise None
+    """
+    try:
+        return os.environ[CBIOPORTAL_INSTANCE_VAR]
+    except KeyError:
+        print("cBioPortal instance not found in environment variables. This is necessary to run analysis. Exiting...")
+        sys.exit(1)
+
+
+def set_cbioportal_token(
+    val: str
+) -> None:
+    """Set the cBioPortal token in the environment variables
+
+    Parameters
+    ----------
+    val : str
+        cBioPortal token
+
+    Returns
+    -------
+    None
+    """
+    os.environ[CBIOPORTAL_TOKEN_VAR] = val
+
+
+def maybe_get_cbioportal_token(
+) -> str | None:
+    """Get the cBioPortal token from the environment
+
+    Returns
+    -------
+    str | None
+        cBioPortal token as string if exists, otherwise None
+    """
+    try:
+        return os.environ[CBIOPORTAL_TOKEN_VAR]
+    except KeyError:
+        return None
+
+
+def set_request_cache(
+    val: str
+) -> None:
+    """Set the request cache path in environment variables
+
+    Parameters
+    ----------
+    val : str
+        Request cache path
+
+    Returns
+    -------
+    None
+    """
+    os.environ[REQUEST_CACHE_VAR] = val
+
+
+def maybe_get_request_cache(
+) -> str | None:
+    """Get the request cache path from the environment
+
+    Returns
+    -------
+    str | None
+        Request cache path as string if exists, otherwise None
+    """
+    try:
+        return os.environ[REQUEST_CACHE_VAR]
+    except KeyError:
+        return None
diff --git a/src/missense_kinase_toolkit/hgnc.py b/src/missense_kinase_toolkit/hgnc.py
index c5e616a..bc65a0e 100644
--- a/src/missense_kinase_toolkit/hgnc.py
+++ b/src/missense_kinase_toolkit/hgnc.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import requests
 
 from missense_kinase_toolkit import requests_wrapper, utils_requests
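For reference, a minimal sketch (assuming a fresh environment where none of these variables are pre-set) of how the `config` getters and setters above are intended to behave:

```python
import os

from missense_kinase_toolkit import config

# setters simply write to os.environ; getters read the value back
config.set_cbioportal_instance("www.cbioportal.org")
assert config.get_cbioportal_instance() == "www.cbioportal.org"

# optional values fall back to None when unset...
os.environ.pop("CBIOPORTAL_TOKEN", None)
assert config.maybe_get_cbioportal_token() is None

# ...whereas required values (e.g., OUTPUT_DIR) exit via sys.exit(1) when missing
```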
diff --git a/src/missense_kinase_toolkit/io_utils.py b/src/missense_kinase_toolkit/io_utils.py
index 2bc0285..e6f3188 100644
--- a/src/missense_kinase_toolkit/io_utils.py
+++ b/src/missense_kinase_toolkit/io_utils.py
@@ -2,7 +2,25 @@
 
 import pandas as pd
 
-DATA_CACHE_DIR = "DATA_CACHE"
+OUTPUT_DIR_VAR = "OUTPUT_DIR"
+
+
+def check_outdir_exists(
+) -> str:
+    """Check that OUTPUT_DIR is in the environment variables and create the directory if it does not exist
+
+    Returns
+    -------
+    str
+        Output directory path
+    """
+    try:
+        path_data = os.environ[OUTPUT_DIR_VAR]
+        if not os.path.exists(path_data):
+            os.makedirs(path_data)
+    except KeyError as err:
+        # fail loudly; returning here would leave path_data unbound
+        raise KeyError("OUTPUT_DIR not found in environment variables...") from err
+
+    return path_data
 
 
 def save_dataframe_to_csv(
@@ -15,19 +33,48 @@
     ----------
     df : pd.DataFrame
         Dataframe to save
-    output_path : str
-        Path to save the CSV file
+    filename : str
+        Filename to save (either with or without the ".csv" suffix)
+
     Returns
     -------
     None
     """
     filename = filename.replace(".csv", "") + ".csv"
+    path_data = check_outdir_exists()
+    df.to_csv(os.path.join(path_data, filename), index=False)
 
-    try:
-        path_data = os.environ[DATA_CACHE_DIR]
-        if not os.path.exists(path_data):
-            os.makedirs(path_data)
-        df.to_csv(os.path.join(path_data, f"{filename}_mutations.csv"), index=False)
-    except KeyError:
-        print("DATA_CACHE not found in environment variables...")
+
+def concatenate_csv_files_with_glob(
+    str_find: str,
+) -> pd.DataFrame:
+    """Use glob to find CSV files to concatenate
+
+    Parameters
+    ----------
+    str_find : str
+        Glob pattern used to find the CSV files of interest
+
+    Returns
+    -------
+    pd.DataFrame
+        Concatenated dataframe
+    """
+    import glob
+
+    str_find = str_find.replace(".csv", "") + ".csv"
+    path_data = check_outdir_exists()
+    csv_files = glob.glob(os.path.join(path_data, str_find))
+
+    df_combo = pd.DataFrame()
+    if len(csv_files) > 0:
+        for csv_file in csv_files:
+            df = pd.read_csv(csv_file)
+            df_combo = pd.concat([df_combo, df])
+    else:
+        print(f"No files matching {str_find} found in {path_data}...")
+
+    # TODO: implement duplicate removal
+
+    return df_combo
diff --git a/src/missense_kinase_toolkit/pfam.py b/src/missense_kinase_toolkit/pfam.py
index f623e7f..d398326 100644
--- a/src/missense_kinase_toolkit/pfam.py
+++ b/src/missense_kinase_toolkit/pfam.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import json
 
 import pandas as pd
diff --git a/src/missense_kinase_toolkit/scrapers.py b/src/missense_kinase_toolkit/scrapers.py
index d36b3ae..21ca8c0 100644
--- a/src/missense_kinase_toolkit/scrapers.py
+++ b/src/missense_kinase_toolkit/scrapers.py
@@ -1,7 +1,10 @@
 import pandas as pd
 
-def scrape_kinhub(
-    url: str ='http://www.kinhub.org/kinases.html'
+from missense_kinase_toolkit import requests_wrapper
+
+
+def kinhub(
+    url: str = "http://www.kinhub.org/kinases.html",
 ) -> pd.DataFrame:
     """Scrape the KinHub database for kinase information
 
@@ -24,7 +27,7 @@
     # perhaps just write own function to clean column names
     # from janitor import clean_names
 
-    page = requests.get(url)
+    page = requests_wrapper.get_cached_session().get(url)
     soup = BeautifulSoup(page.content, "html.parser")
 
     list_header = [t for tr in soup.select('tr') for t in tr if t.name == 'th']
@@ -49,4 +52,10 @@
     df_kinhub = pd.DataFrame.from_dict(dict_kinhub)
     # df_kinhub = clean_names(df_kinhub)
 
-    return df_kinhub
+    # for kinases with 2 kinase domains, entries are duplicated despite sharing the same UniProt ID
+    # drop these duplicates
+    df_kinhub_drop = df_kinhub.loc[~df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x)), ]
+    # list_uniprot = df_kinhub["UniprotID"][df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x))].to_list()
+    # assert df_kinhub.shape[0] - df_kinhub_drop.shape[0] == df_kinhub_drop["UniprotID"].isin(list_uniprot).sum()
+
+    return df_kinhub_drop
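As an illustrative sketch only (the output path is a placeholder; the file patterns are the defaults used elsewhere in this diff), the transform step stitches these `io_utils` and `scrapers` helpers together roughly as follows:

```python
from missense_kinase_toolkit import config, io_utils, scrapers

config.set_output_dir("/tmp/mkt_output")  # placeholder path

# gather every per-study extract written by the extract step ("<study_id>_mutations.csv")
df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")

# scrape KinHub (second-kinase-domain duplicates already dropped) and cache the table
df_kinhub = scrapers.kinhub()
io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")

print(df_cbioportal.shape, df_kinhub.shape)
```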
diff --git a/src/nextflow/README.MD b/src/nextflow/README.MD
new file mode 100644
index 0000000..c27e1b1
--- /dev/null
+++ b/src/nextflow/README.MD
@@ -0,0 +1,24 @@
+# NextFlow workflow to run pipeline
+
+To run: `nextflow run main.nf -params-file params.json`
+
+Generate your own `params.json` file using the following parameters:
+```
+{
+    "CBIOPORTAL_COHORT" : "TODO",
+    "OUTPUT_DIR" : "TODO",
+    "CBIOPORTAL_INSTANCE" : "TODO",
+    "CBIOPORTAL_TOKEN" : "TODO",
+    "REQUESTS_CACHE" : "TODO"
+}
+```
+
+Below is a description of what each variable should contain. If a variable is optional and not in use, do not create an entry for it in the `json` file.
+
+| Variable               | Optional | Description                                        |
+| :--------------------- | :------: | :------------------------------------------------- |
+| `CBIOPORTAL_COHORT`    | No       | cBioPortal cohort to analyze                       |
+| `OUTPUT_DIR`           | No       | Path to the output directory in which to save data |
+| `CBIOPORTAL_INSTANCE`  | Yes      | Defaults to `cbioportal.org` if none provided      |
+| `CBIOPORTAL_TOKEN`     | Yes      | Data Access Token if using a private instance      |
+| `REQUESTS_CACHE`       | Yes      | Path to the directory used to cache requests data  |
diff --git a/src/nextflow/extract_cbioportal.nf b/src/nextflow/extract_cbioportal.nf
new file mode 100644
index 0000000..b9462ca
--- /dev/null
+++ b/src/nextflow/extract_cbioportal.nf
@@ -0,0 +1,16 @@
+process PROCESS_CBIOPORTAL {
+    input:
+    tuple val(cbio_cohort), path(out_dir), val(cbio_inst), val(cbio_token), path(request_cache)
+
+    output:
+    path("${out_dir}/cbioportal")
+    """
+    export PYTHONHASHSEED=0
+    extract_cbioportal \
+        --cohort ${cbio_cohort} \
+        --outDir ${out_dir} \
+        --instance ${cbio_inst} \
+        --token ${cbio_token} \
+        --requestsCache ${request_cache}
+    """
+}