diff --git a/.circleci/config.yml b/.circleci/config.yml index 0e16d21..788c38a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` - - image: cimg/python:3.8.0 + - image: continuumio/miniconda3 # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -19,39 +19,38 @@ jobs: steps: - checkout - + - run: - name: Set up Anaconda + name: Set up Conda command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - export PATH=~/miniconda/bin:$PATH - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - conda update --yes --quiet conda; conda init bash - sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV - + conda update --yes --quiet conda; + export CONDA_EXE=/opt/conda/bin/conda + sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV + - run: name: Build cookiecutter environment and test-env project command: | - conda create -n cookiecutter --yes python=3.8 + conda create -n cookiecutter --yes python=3.8 make conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml - mkdir /home/circleci/.cookiecutter_replay - cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json + mkdir -p /root/repo/.cookiecutter_replay + cp circleci-cookiecutter-easydata.json /root/repo/.cookiecutter_replay/cookiecutter-easydata.json pwd + which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . -f --no-input - conda deactivate - run: name: Create test-env environment and contrive to always use it command: | + conda activate cookiecutter cd test-env - export CONDA_EXE=/home/circleci/miniconda/bin/conda + export CONDA_EXE=/opt/conda/bin/conda make create_environment + python scripts/tests/add-extra-channel-dependency.py conda activate test-env + conda install -c anaconda make touch environment.yml make update_environment echo "conda activate test-env" >> $BASH_ENV; diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b110146..0000000 --- a/.travis.yml +++ /dev/null @@ -1,51 +0,0 @@ -language: python - -cache: - directories: - - $HOME/.cache/pip - -python: - - "3.8" - -envs: - - REQUIRED_PYTHON="python3" - -install: - # install miniconda - - deactivate - - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - - MINICONDA_PATH=/home/travis/miniconda3 - - chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH - - chmod +x $MINICONDA_PATH - - export PATH=$MINICONDA_PATH/condabin:$PATH - - conda update --yes conda - # create cookiecutter environment - - conda create -n cookiecutter --yes python=3.8 - - conda init bash - - . ~/.bashrc - - conda activate cookiecutter - - pip install cookiecutter - - pip install ruamel.yaml - -script: - - pwd - # build a cookiecutter project test-env - - cookiecutter --config-file .cookiecutter-easydata-test.yml . 
-f --no-input - - conda deactivate - # create the environment from test-env - - cd test-env - - make create_environment - - conda activate test-env - - touch environment.yml - - make update_environment - # create test dataset - - python src/tests/make_test_datasets.py - # run tests on the src module - - export CI_RUNNING=yes - - make test_with_coverage - # test notebooks in docs - - pytest -v ../docs/test_docs.py - -after_success: - - conda activate test-env - - coveralls \ No newline at end of file diff --git a/README.md b/README.md index 2f2a732..7497c45 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,24 @@ python -m pip install -f requirements.txt cookiecutter https://github.com/hackalog/easydata +### To find out more +------------ +A good place to start is with reproducible environments. We have a tutorial here: [Getting Started with EasyData Environments](https://github.com/hackalog/easydata/wiki/Getting-Started-with-EasyData-Environments). + +The next place to look is in the customized documentation that is in any EasyData created repo. It is customized to the settings that you put in your template. These are reference documents that can be found under `references/easydata` that are customized to your repo that cover: + * more on conda environments + * more on paths + * git configuration (including setting up ssh with GitHub) + * git workflows + * tricks for using Jupyter notebooks in an EasyData environment + * troubleshooting + * recommendations for how to share your work + +Furthermore, see: +* [The EasyData documentation on read the docs](https://cookiecutter-easydata.readthedocs.io/en/latest/?badge=latest): this contains up-to-date working exmaples of how to use EasyData for reproducible datasets and some ways to use notebooks reproducibly +* [Talks and Tutorials based on EasyData](https://github.com/hackalog/easydata/wiki/EasyData-Talks-and-Tutorials) +* [Catalog of EasyData Documentation](https://github.com/hackalog/easydata/wiki/Catalog-of-EasyData-Documentation) +* [The EasyData wiki](https://github.com/hackalog/easydata/wiki) Check here for further troubleshooting and how-to guides for particular problems that aren't in the `references/easydata` docs (including a `git` tutorial) ### The resulting directory structure ------------ diff --git a/cookiecutter.json b/cookiecutter.json index d411e76..cf3153e 100644 --- a/cookiecutter.json +++ b/cookiecutter.json @@ -1,12 +1,12 @@ { "project_name": "project_name", "repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}", - "default_branch": ["master", "main"], + "default_branch": ["main", "master"], "module_name": "src", - "author_name": "Your name (or your organization/company/team)", + "author_name": "Your name (or the copyright holder)", "description": "A short description of this project.", "open_source_license": ["MIT", "BSD-2-Clause", "Proprietary"], - "python_version": ["3.7", "3.6", "latest", "3.8"], + "python_version": ["latest", "3.11", "3.10", "3.9", "3.8", "3.7"], "conda_path": "~/anaconda3/bin/conda", "upstream_location": ["github.com", "gitlab.com", "bitbucket.org", "your-custom-repo"] } diff --git a/docs/00-xyz-sample-notebook.ipynb b/docs/00-xyz-sample-notebook.ipynb index a089002..cc90381 100644 --- a/docs/00-xyz-sample-notebook.ipynb +++ b/docs/00-xyz-sample-notebook.ipynb @@ -150,7 +150,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/Add-csv-template.ipynb b/docs/Add-csv-template.ipynb index ad69434..ad1e37d 100644 --- 
a/docs/Add-csv-template.ipynb +++ b/docs/Add-csv-template.ipynb @@ -83,7 +83,7 @@ "* `csv_path`: The desired path to your .csv file (in this case `epidemiology.csv`) relative to paths['raw_data_path']\n", "* `download_message`: The message to display to indicate to the user how to manually download your .csv file.\n", "* `license_str`: Information on the license for the dataset\n", - "* `descr_str`: Information on the dataset itself" + "* `readme_str`: Information on the dataset itself" ] }, { @@ -123,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "descr_str = \"\"\"\n", + "readme_str = \"\"\"\n", "The epidemiology table from Google's [COVID-19 Open-Data dataset](https://github.com/GoogleCloudPlatform/covid-19-open-data). \n", "\n", "The full dataset contains datasets of daily time-series data related to COVID-19 for over 20,000 distinct locations around the world. The data is at the spatial resolution of states/provinces for most regions and at county/municipality resolution for many countries such as Argentina, Brazil, Chile, Colombia, Czech Republic, Mexico, Netherlands, Peru, United Kingdom, and USA. All regions are assigned a unique location key, which resolves discrepancies between ISO / NUTS / FIPS codes, etc. The different aggregation levels are:\n", @@ -170,7 +170,7 @@ " csv_path=csv_path,\n", " download_message=download_message,\n", " license_str=license_str,\n", - " descr_str=descr_str,\n", + " readme_str=readme_str,\n", " overwrite_catalog=True)" ] }, @@ -206,9 +206,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.EXTRA`.\n", + "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.FILESET`.\n", "\n", - "The `.EXTRA` functionality is covered in other documentation." + "The `.FILESET` functionality is covered in other documentation." ] }, { @@ -236,7 +236,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds_raw.EXTRA" + "ds_raw.FILESET" ] }, { @@ -246,7 +246,7 @@ "outputs": [], "source": [ "# fq path to epidemiology.csv file\n", - "ds_raw.extra_file('epidemiology.csv')" + "ds_raw.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/Add-derived-dataset.ipynb b/docs/Add-derived-dataset.ipynb index e639190..d5e93e4 100644 --- a/docs/Add-derived-dataset.ipynb +++ b/docs/Add-derived-dataset.ipynb @@ -85,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { @@ -219,7 +219,7 @@ " source_dataset_name\n", " dataset_name\n", " data_function\n", - " added_descr_txt\n", + " added_readme_txt\n", "\n", "We'll want our `data_function` to be defined in the project module (in this case `src`) for reproducibility reasons (which we've already done with `subselect_by_key` above)." 
] @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "added_descr_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", + "added_readme_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", "to the {key} dataset.\"\"\"" ] }, @@ -281,7 +281,7 @@ " source_dataset_name=source_dataset_name,\n", " dataset_name=dataset_name,\n", " data_function=data_function,\n", - " added_descr_txt=added_descr_txt,\n", + " added_readme_txt=added_readme_txt,\n", " overwrite_catalog=True)" ] }, @@ -318,7 +318,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/New-Dataset-Template.ipynb b/docs/New-Dataset-Template.ipynb index bcf7826..abb8e88 100644 --- a/docs/New-Dataset-Template.ipynb +++ b/docs/New-Dataset-Template.ipynb @@ -167,7 +167,7 @@ "metadata": {}, "source": [ "### Create a process function\n", - "By default, we recommend that you use the `process_extra_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." + "By default, we recommend that you use the `process_fileset_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." ] }, { @@ -176,11 +176,11 @@ "metadata": {}, "outputs": [], "source": [ - "from src.data.extra import process_extra_files\n", - "process_function = process_extra_files\n", + "from src.data.fileset import process_fileset_files\n", + "process_function = process_fileset_files\n", "process_function_kwargs = {'file_glob':'*.csv',\n", " 'do_copy': True,\n", - " 'extra_dir': ds_name+'.extra',\n", + " 'fileset_dir': ds_name+'.fileset',\n", " 'extract_dir': ds_name}" ] }, @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.EXTRA" + "ds.FILESET" ] }, { @@ -364,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.extra_file('epidemiology.csv')" + "ds.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/New-Edge-Template.ipynb b/docs/New-Edge-Template.ipynb index 6a1c5bb..3b1058e 100644 --- a/docs/New-Edge-Template.ipynb +++ b/docs/New-Edge-Template.ipynb @@ -88,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "source_ds.EXTRA" + "source_ds.FILESET" ] }, { @@ -178,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/test_docs.py b/docs/test_docs.py index 045cc56..7e8d17a 100644 --- a/docs/test_docs.py +++ b/docs/test_docs.py @@ -9,6 +9,8 @@ import requests from src import paths +from src.log import logger + CCDS_ROOT = Path(__file__).parents[1].resolve() DOCS_DIR = CCDS_ROOT / "docs" @@ -35,6 +37,7 @@ def test_notebook_csv(self): csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv" csv_dest = paths['raw_data_path'] / "epidemiology.csv" if not csv_dest.exists(): + logger.debug("Downloading epidemiology.csv") csv_file = requests.get(csv_url) with open(csv_dest, 'wb') as f: f.write(csv_file.content) diff --git a/{{ cookiecutter.repo_name }}/.circleci/config.yml b/{{ cookiecutter.repo_name }}/.circleci/config.yml index 86db8c0..98373ef 100644 --- a/{{ cookiecutter.repo_name }}/.circleci/config.yml +++ b/{{ cookiecutter.repo_name }}/.circleci/config.yml @@ -8,7 +8,8 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. 
`3.6.1-browsers` - - image: circleci/python:3.7.0 + - image: continuumio/miniconda3 + # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -20,14 +21,6 @@ jobs: steps: - checkout - - run: - name: Set up Anaconda - command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - - run: name: Create environment and contrive to always use it command: | diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index addf322..15ba76e 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -75,17 +75,12 @@ test: update_environment $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -## Run all Unit Tests with coverage +## Run all Unit and code coverage tests test_with_coverage: update_environment $(SET) LOGLEVEL=DEBUG; coverage run -m pytest --pyargs --doctest-modules --doctest-continue-on-failure --verbose \ $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -.PHONY: lint -## Lint using flake8 -lint: - flake8 $(MODULE_NAME) - .phony: help_update_easydata help_update_easydata: @$(PYTHON_INTERPRETER) scripts/help-update.py @@ -105,7 +100,7 @@ debug: # Self Documenting Commands # ################################################################################# -HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM +HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM SHELL .DEFAULT_GOAL := show-help .PHONY: show-help diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 4c65eb7..43396df 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -4,28 +4,20 @@ include Makefile.include -$(LOCKFILE): check_installation .make.bootstrap .make.pip-requirements.txt .make.environment-default.yml .make.conda-forge-requirements.txt +$(LOCKFILE): check_installation .make.bootstrap split_environment_files ifeq (conda, $(VIRTUALENV)) - $(CONDA_EXE) env update -n $(PROJECT_NAME) -f .make.environment-default.yml --prune - $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.conda-forge-requirements.txt --channel defaults --channel conda-forge --strict-channel-priority --yes + $(foreach channel, $(shell $(CAT) .make.channel-order.include),\ + $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$(channel)-environment.txt --channel defaults --channel $(channel) --strict-channel-priority --yes $(CMDSEP)) $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture pip install -r .make.pip-requirements.txt $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $(LOCKFILE) else $(error Unsupported Environment `$(VIRTUALENV)`. 
Use conda) endif -# extract multi-phase dependencies from environment.yml -.make.environment-pip.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip-yaml $(PROJECT_DIR)environment.yml > $@ - -.make.pip-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip $(PROJECT_DIR)environment.yml > $@ - -.make.conda-forge-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py conda-forge $(PROJECT_DIR)environment.yml > $@ - -.make.environment-default.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py default $(PROJECT_DIR)environment.yml > $@ +.PHONY: split_environment_files +# extract multi-phase dependencies from environment.yml and create ordering file +split_environment_files: environment.yml .make.bootstrap + $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py $(PROJECT_DIR)environment.yml .make.bootstrap: scripts/bootstrap.yml $(CONDA_EXE) env update -n $(PROJECT_NAME) -f scripts/bootstrap.yml @@ -69,6 +61,7 @@ endif # Checks that the conda environment is active environment_enabled: ifeq (conda,$(VIRTUALENV)) + $(CONDA_EXE) config --env --set channel_priority strict ifneq ($(notdir ${CONDA_DEFAULT_ENV}), $(PROJECT_NAME)) $(error Run "$(VIRTUALENV) activate $(PROJECT_NAME)" before proceeding...) endif diff --git a/{{ cookiecutter.repo_name }}/Makefile.include b/{{ cookiecutter.repo_name }}/Makefile.include index e8486ca..85854ee 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.include +++ b/{{ cookiecutter.repo_name }}/Makefile.include @@ -19,5 +19,4 @@ CAT ?= cat SET ?= export WHICH ?= which DEVNULL ?= /dev/null - -$(warning From here on, using SHELL = $(SHELL)) +CMDSEP ?= ; diff --git a/{{ cookiecutter.repo_name }}/Makefile.win32 b/{{ cookiecutter.repo_name }}/Makefile.win32 index 92d8800..de046eb 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.win32 +++ b/{{ cookiecutter.repo_name }}/Makefile.win32 @@ -5,6 +5,7 @@ CAT = type SET = set WHICH = where DEVNULL = nul +CMDSEP = & # Some UNIXish packages force the installation of a Bourne-compatible shell, and Make # prefers using this when it sees it. We thus force the usage of the good ole Batch diff --git a/{{ cookiecutter.repo_name }}/environment.yml b/{{ cookiecutter.repo_name }}/environment.yml index 6749871..5982a14 100644 --- a/{{ cookiecutter.repo_name }}/environment.yml +++ b/{{ cookiecutter.repo_name }}/environment.yml @@ -1,6 +1,6 @@ {% macro pyver() -%} {% if cookiecutter.python_version == 'latest' -%} - - python=3 + - python {% else -%} - python={{ cookiecutter.python_version }} {% endif -%} diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md index e698b52..60a9a9f 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md @@ -4,13 +4,19 @@ The `{{ cookiecutter.repo_name }}` repo is set up with template code to make man If you haven't yet, configure your conda environment. 
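The `Makefile.envs` change above replaces the hard-coded conda-forge pass with a loop over the channels listed in `.make.channel-order.include`, which is written by `scripts/split_pip.py`. As a rough, hypothetical illustration only (this Python is not part of the repo), the new `$(LOCKFILE)` rule behaves roughly like:
```python
# Illustrative sketch only: the per-channel install loop from the new Makefile.envs rule,
# expressed in Python. Assumes scripts/split_pip.py has already written the .make.* files
# and that conda is on PATH; PROJECT_NAME is a placeholder for $(PROJECT_NAME).
import subprocess

PROJECT_NAME = "my_project"   # placeholder
CONDA_EXE = "conda"           # the Makefile uses $(CONDA_EXE)

with open(".make.channel-order.include") as f:
    channels = f.read().split()          # e.g. ["defaults", "conda-forge"]

for channel in channels:
    subprocess.run(
        [CONDA_EXE, "install", "-n", PROJECT_NAME,
         "--file", f".make.{channel}-environment.txt",
         "--channel", "defaults", "--channel", channel,
         "--strict-channel-priority", "--yes"],
        check=True,
    )

# pip requirements are installed last, inside the target environment
subprocess.run(
    [CONDA_EXE, "run", "-n", PROJECT_NAME, "--no-capture",
     "pip", "install", "-r", ".make.pip-requirements.txt"],
    check=True,
)
```
Each channel gets its own `conda install` pass, in the order given by the `channel-order` section of `environment.yml`, before the lock file is exported.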
+**WARNING**: If you have conda-forge listed as a channel in your `.condarc` (or any other channels other than defaults), you may experience great difficulty generating reproducible conda environments. + +We recommend you remove conda-forge (and all other non-default channels) from your `.condarc` file and [set your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html). You can still use conda-forge (or any other conda channel), just specify it explicitly in your `environment.yml` by prefixing your package name with `channel-name::`; e.g. +``` + - wheel # install from the default (anaconda) channel + - pytorch::pytorch # install this from the `pytorch` channel + - conda-forge::tokenizers # install this from conda-forge +``` + ## Configuring your python environment Easydata uses conda to manage python packages installed by both conda **and pip**. ### Adjust your `.condarc` -**WARNING FOR EXISTING CONDA USERS**: If you have `conda-forge` listed as a channel in your `.condarc` (or any other channels other than `default`), **remove them**. These channels should be specified in `environment.yml` instead. - -We also recommend [setting your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html) to reduce package incompatibility problems. This will be the default in conda 5.0, but in order to assure reproducibility, we need to use this behavior now. ``` conda config --set channel_priority strict @@ -26,18 +32,30 @@ conda config --prepend channels defaults conda config --prepend envs_dirs ~/.conda/envs # Store environments in local dir for JupyterHub ``` -### Fix the CONDA_EXE path -* Make note of the path to your conda binary: +#### Locating the `conda` binary +Ensure the Makefile can find your conda binary, either by setting the `CONDA_EXE` environment variable, or by modifying `Makefile.include` directly. + +First, check if `CONDA_EXE` is already set +``` + >>> export | grep CONDA_EXE + CONDA_EXE=/Users/your_username/miniconda3/bin/conda +``` + +If `CONDA_EXE` is not set, you will need to set it manually in `Makefile.include`; i.e. + +* Make note of the path to your conda binary. It should be in the `bin` subdirectory of your Anaconda (or miniconda) installation directory: ``` - $ which conda + >>> which conda # this will only work if conda is in your PATH, otherwise, verify manually ~/miniconda3/bin/conda ``` -* ensure your `CONDA_EXE` environment variable is set correctly in `Makefile.include` +* ensure your `CONDA_EXE` environment variable is set to this value; i.e. ``` - export CONDA_EXE=~/miniconda3/bin/conda + >>> export CONDA_EXE=~/miniconda3/bin/conda ``` +or edit `Makefile.include` directly. + ### Create the conda environment -* Create and switch to the virtual environment: +Create and switch to the virtual environment: ``` cd {{ cookiecutter.repo_name }} make create_environment @@ -63,6 +81,7 @@ When adding packages to your python environment, **do not `pip install` or `cond Your `environment.yml` file will look something like this: ``` name: {{ cookiecutter.repo_name }} +dependencies: - pip - pip: - -e . # conda >= 4.4 only @@ -88,7 +107,7 @@ name: {{ cookiecutter.repo_name }} ``` To add any package available from conda, add it to the end of the list. If you have a PYPI dependency that's not avaible via conda, add it to the list of pip installable dependencies under ` - pip:`. 
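These edits can also be scripted. The sketch below is not code from the repo; it uses the same `yaml.safe_load`/`yaml.safe_dump` round-trip as the new `scripts/tests/add-extra-channel-dependency.py` helper that appears later in this diff, and the package names are purely hypothetical:
```python
# Hypothetical sketch: programmatically adding dependencies to environment.yml,
# following the same pattern as scripts/tests/add-extra-channel-dependency.py.
import yaml

with open("environment.yml", "rt", encoding="utf-8") as f:
    env = yaml.safe_load(f)

env["dependencies"].append("some-conda-package")        # hypothetical conda package
for dep in env["dependencies"]:
    if isinstance(dep, dict) and "pip" in dep:
        dep["pip"].append("some-pip-only-package")       # hypothetical pip dependency

with open("environment.yml", "wt", encoding="utf-8") as f:
    yaml.safe_dump(env, f)
```
Note that `yaml.safe_dump` does not preserve comments or key order, so for routine changes editing `environment.yml` by hand and running `make update_environment` is usually simpler.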
-You can include any {{ cookiecutter.upstream_location }} python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. +You can include any `{{ cookiecutter.upstream_location }}` python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. In particular, if you're working off of a fork or a work in progress branch of a repo in {{ cookiecutter.upstream_location }} (say, your personal version of ), you can change `git+https://{{ cookiecutter.upstream_location }}//` to @@ -99,6 +118,43 @@ Once you're done your edits, run `make update_environment` and voila, you're upd To share your updated environment, check in your `environment.yml` file. (More on this in [Sharing your Work](sharing-your-work.md)) +#### Adding packages from other conda channels +Say we want to add a package only available from the `conda-forge` conda channel and not the default conda channel. (The conda channel is what follows `-c` when using `conda install -c my-channel my-package`. Suppose we want to use `make` on windows. Then we need to use `conda-forge` since the default conda channel only has linux and macOS installations of `make`. To normally conda install this, we would use `conda install -c conda-forge make`. **We won't do that here**. + +Instead, we add a `channel-order` section that starts with `defaults` and lists the other channels we want to use in the order we want to install from them (note that this is a custom EasyData section to the `environment.yml`). Then we add our package in the dependency list in the form `channel-name::package-name`, for example, `conda-forge::make`. + +In this case an updated `environment.yml` file looks like this: +``` +name: {{ cookiecutter.repo_name }} +channel-order: + - defaults + - conda-forge +dependencies: + - pip + - pip: + - -e . # conda >= 4.4 only + - python-dotenv>=0.5.1 + - nbval + - nbdime + - umap-learn + - gdown + - setuptools + - wheel + - git>=2.5 # for git worktree template updating + - sphinx + - bokeh + - click + - colorcet + - coverage + - coveralls + - datashader + - holoviews + - matplotlib + - jupyter + - conda-forge::make +... +``` + #### Lock files Now, we'll admit that this workflow isn't perfectly reproducible in the sense that conda still has to resolve versions from the `environment.yml`. To make it more reproducible, running either `make create_environment` or `make update_environment` will generate an `environment.{$ARCH}.lock.yml` (e.g. `environment.i386.lock.yml`). This file keeps a record of the exact environment that is currently installed in your conda environment `{{ cookiecutter.repo_name }}`. If you ever need to reproduce an environment exactly, you can install from the `.lock.yml` file. (Note: These are architecture dependent). 
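To make the `channel-order` mechanism concrete, here is a simplified, hedged sketch of what the rewritten `scripts/split_pip.py` does with such a file. The `.make.*` file names are the real ones consumed by `Makefile.envs`; the logic here is abridged:
```python
# Abridged sketch of the splitting done by scripts/split_pip.py (simplified; the real
# script also validates that every channel-prefixed dependency appears in channel-order).
from collections import defaultdict
import yaml

with open("environment.yml") as f:
    env = yaml.safe_load(f)

channel_order = env.get("channel-order") or ["defaults"]
if "defaults" not in channel_order:
    channel_order.insert(0, "defaults")

channel_dict = defaultdict(list)
for dep in env["dependencies"]:
    if isinstance(dep, dict) and dep.get("pip"):
        channel_dict["pip"] = dep["pip"]              # the nested pip requirement list
    elif isinstance(dep, str) and "::" in dep:
        channel, pkg = dep.split("::", 1)             # e.g. "conda-forge::make"
        channel_dict[channel].append(pkg)
    else:
        channel_dict["defaults"].append(dep)

with open(".make.channel-order.include", "w") as f:
    f.write(" ".join(channel_order))                  # conda channels, in install order

for channel in channel_order + ["pip"]:
    filename = (".make.pip-requirements.txt" if channel == "pip"
                else f".make.{channel}-environment.txt")
    with open(filename, "w") as f:
        f.write("\n".join(channel_dict[channel]))
```
`Makefile.envs` then runs one `conda install` per listed channel (as sketched earlier) and installs the pip requirements last.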
diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md index 5b54c1e..ff5923c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md @@ -3,8 +3,8 @@ ## TL;DR To get started, all you really need to know is that you can query for available datasets via ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog() +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets") ``` and load these datasets via @@ -15,15 +15,18 @@ ds = Dataset.load(dataset_name) If you've followed the instructions from building the repo contained in the [README](../README.md), this should just work (if it doesn't, please let us know)! -You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.DESCR` and `ds.LICENSE`. +You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.README` and `ds.LICENSE`. -**Warning**: some of the datasets can be quite large. If you want to store your data externally, we recommend symlinking your data directory (that is the `{{ cookiecutter.repo_name }}/data` directory) to somewhere with more room before loading your first `Dataset`. +**Disk Space Note**: sometimes datasets can be quite large. If you want to store your data externally, we recommend pointing your data directory to a new location; that is, +```python +from {{ cookiecutter.module_name }} import paths +paths["data_path"] = "/path/to/big/data/directory" +``` ## Digging Deeper It is useful to know a little bit more about how Datasets work. - ## What is a `Dataset` object? A Dataset is the fundamental object we use for turning raw data into useful datasets, reproducibly. It is like a scikit-learn-style `Bunch` object --- essentially, a dictionary with some extra magic to make it nicer to work with --- containing the following attributes: @@ -36,7 +39,7 @@ A Dataset is the fundamental object we use for turning raw data into useful data The `data` attribute can really be any processed data form that you like: sometimes it's a pandas dataframe (like with `wine_reviews_130k`), a list of tuples containing other data, (`reddit_comment_tree_graphs`), or other formats including `scipy.sparse` matrices or `igraph` graphs. The `target` (if you're using it), expects something that matches the `data` in terms of length. -For a hint as to which data format to expect, you can look at the contents of the `DESCR` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. +For a hint as to which data format to expect, you can look at the contents of the `README` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. This `metadata` is where things get interesting... which we'll cover on its own next. @@ -44,9 +47,9 @@ This `metadata` is where things get interesting... which we'll cover on its own The `metadata` is where the magic lives. 
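Putting the renamed pieces together (`Catalog`, `Dataset.load`, and the `README`/`LICENSE`/`FILESET` attributes that replace `DESCR`/`EXTRA`), here is a hedged usage sketch. The module name assumes the default `src`, and the dataset names match the ones built in the docs notebooks; substitute whatever your catalog actually contains:
```python
# Hedged usage sketch; dataset names are the ones used in the docs notebooks and may not
# exist in your repo. Module name assumes the cookiecutter default `src`.
from src import paths
from src.data import Catalog, Dataset

paths["data_path"] = "/path/to/big/data/directory"   # optional: relocate data storage

Catalog.load("datasets")                     # browse the available dataset recipes

ds = Dataset.load("covid-19-epidemiology")
print(ds.README)                             # what the data is (formerly ds.DESCR)
print(ds.LICENSE)                            # usage restrictions (unchanged)
ds.data                                      # processed data, often a pandas DataFrame

ds_raw = Dataset.load("covid-19-epidemiology_raw")
ds_raw.FILESET                               # raw-file manifest (formerly ds.EXTRA)
ds_raw.fileset_file("epidemiology.csv")      # fully qualified path to a fileset file
```
Everything beyond `ds.data` here is drawn from the dataset's `metadata`.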
It serves several purposes in terms of bookkeeping: * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, -* it provides easy access to **what the data is** via the `DESCR` attribute, +* it provides easy access to **what the data is** via the `README` attribute, * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). -* it provides the **extra data manifest**, `EXTRA`, if your dataset includes around additional raw data (extra) files. +* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (fileset) files. In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. @@ -73,21 +76,19 @@ ds.metadata To access the most common metadata fields: ```python -ds.DESCR # or ds.metadata['descr'] +ds.README # or ds.metadata['readme'] ds.LICENSE # or ds.metadata['license'] ds.HASHES # or ds.metadata['hashes'] ``` ## The catalog -While we do our best to keep the documentation in [Available Datasets](docs/available-datasets.md) up-to-date with what's in the code, you can explore all of the currently available `Datasets` via the `dataset_catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. +You can explore all of the currently available `Datasets` via the Dataset `Catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. To access the catalog: ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog(keys_only=True) +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets") ``` -If you're interested, set `keys_only=False` to see the complete contents of the metadata that is saved in the catalog. - ## Sharing your Data as a `Dataset` object In order to convert your data to a `Dataset` object, you will need to generate a catalog *recipe*, that uses a custom *function for processing your raw data*. Doing so allows us to document all the munging, pre-processing, and data verification necessary to reproducibly build the dataset. diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md index ce9e87d..50d5179 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md @@ -1,5 +1,5 @@ -# The Easydata Git Workflow -Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][cookiecutter-easydata]. +# The EasyData Git Workflow +Here's our suggestion for a reliable git workflow that works well in **small team settings**; e.g. when using [Easydata][easydata] in a group setting. ## Git configuration @@ -49,7 +49,7 @@ git merge {{cookiecutter.default_branch}} git push origin my_branch ``` -### Do I have any stale branches? 
+### Clean up the junk With your local `{{cookiecutter.default_branch}}`, `origin/{{cookiecutter.default_branch}}` and `upstream/{{cookiecutter.default_branch}}` all in sync, we like to clean up any old branches that are fully merged (and hence, can be deleted without data loss.) ```bash git branch --merged {{cookiecutter.default_branch}} @@ -58,15 +58,15 @@ git branch -d A really great feature of `git branch -d` is that it will refuse to remove a branch that hasn't been fully merged into another. Thus it's safe to use without any fear of data loss. -### Time to start the day +### Start the day Once you've finished all your merge tasks, you can create a clean working branch from the latest `{{cookiecutter.default_branch}}` by doing a: ```bash git checkout {{cookiecutter.default_branch}} git checkout -b new_branch_name ``` +That's it! Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue in our +[easydata issue tracker]. -That's it!. Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue at -[cookiecutter-easydata]. - -[cookiecutter-easydata]: https://github.com/hackalog/cookiecutter-easydata/ \ No newline at end of file +[easydata issue tracker]: https://github.com/hackalog/easydata/issues +[easydata]: https://github.com/hackalog/easydata \ No newline at end of file diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md index f975369..270775c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md @@ -79,8 +79,7 @@ output_notebook(resources=INLINE) # Source module imports from {{ cookiecutter.module_name }} import paths -from {{ cookiecutter.module_name }}.data import DataSource, Dataset -from {{ cookiecutter.module_name }} import workflow +from {{ cookiecutter.module_name }}.data import DataSource, Dataset, Catalog ``` You can also find most of these header cells in [00-xyz-sample-notebook.ipynb](../notebooks/00-xyz-sample-notebook.ipynb) @@ -97,8 +96,9 @@ There is a whole world of cell magics. 
These are bits of code that you can put a ### Quick References * [README](../README.md) -* [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) +* [Setting up and Maintaining your Conda Environment, Reproducibly](conda-environments.md) * [Getting and Using Datasets](datasets.md) +* [Specifying Paths in Easydata](paths.md) * [Using Notebooks for Analysis](notebooks.md) * [Sharing your Work](sharing-your-work.md) * [Troubleshooting Guide](troubleshooting.md) diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index c52f026..d0e5cc0 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -1,5 +1,13 @@ +{% macro pyver() -%} +{% if cookiecutter.python_version == 'latest' -%} + - python +{% else -%} + - python={{ cookiecutter.python_version }} +{% endif -%} +{% endmacro -%} +name: {{ cookiecutter.repo_name }} channels: - - defaults + - defaults dependencies: - - python=3.7 - - pyyaml + - pyyaml +{{ pyver()|indent(3, true) }} diff --git a/{{ cookiecutter.repo_name }}/scripts/split_pip.py b/{{ cookiecutter.repo_name }}/scripts/split_pip.py index ecdc987..62d059c 100644 --- a/{{ cookiecutter.repo_name }}/scripts/split_pip.py +++ b/{{ cookiecutter.repo_name }}/scripts/split_pip.py @@ -2,13 +2,19 @@ import json import sys import yaml +from collections import defaultdict -ACCEPTABLE_FORMATS = ["default", "pip", "pip-yaml", "conda-forge"] -def env_split(conda_env, kind="default"): - """Given a conda_environment dict, split into pip/nonpip versions +def env_split(conda_env, channel_order): + """Given a conda_environment dict, and a channel order, split into versions for each channel. + + Returns: + + conda_env: (list) + remaining setup bits of the environment.yml file + channel_dict: (dict) + dict containing the list of dependencies by channel name - conda_env: dict Python object corresponding to environment.yml""" # Cheater way to make deep Copies json_copy = json.dumps(conda_env) @@ -17,49 +23,63 @@ def env_split(conda_env, kind="default"): pipdeps = None deplist = conda_env.pop('dependencies') - conda_forge_list = [] + channel_dict = defaultdict(list) for k, dep in enumerate(deplist[:]): # Note: copy list, as we mutate it if isinstance(dep, dict): # nested yaml if dep.get('pip', None): - pipdeps = ["pip", deplist.pop(k)] + channel_dict['pip'] = deplist.pop(k) else: - prefix = 'conda-forge::' - if dep.startswith(prefix): - conda_forge_list.append(dep[len(prefix):]) + prefix_check = dep.split('::') + if len(prefix_check) > 1: + channel = prefix_check[0] + if not channel in channel_order: + raise Exception(f'the channel {channel} required for {dep} is not specified in a channel-order section of the environment file') + channel_dict[f'{channel}'].append(prefix_check[1]) deplist.remove(dep) - conda_env['dependencies'] = deplist - pip_env['dependencies'] = pipdeps - return conda_env, pip_env, conda_forge_list + channel_dict['defaults'] = deplist + conda_env.pop('channel-order', None) + return conda_env, channel_dict + +def get_channel_order(conda_env): + """ + Given a conda_environment dict, get the channels from the channel order. 
+ """ + channel_order = conda_env.get('channel-order') + + if channel_order is None: + channel_order = ['defaults'] + if not 'defaults' in channel_order: + channel_order.insert(0, 'defaults') + channel_order.append('pip') + return channel_order def usage(): print(f""" -Usage: split_pip.py [{"|".join(ACCEPTABLE_FORMATS)}] path/to/environment.yml +Usage: split_pip.py path/to/environment.yml """) if __name__ == '__main__': - if len(sys.argv) != 3: - usage() - exit(1) - - kind = sys.argv[1] - if kind not in ACCEPTABLE_FORMATS: + if len(sys.argv) != 2: usage() exit(1) - with open(sys.argv[2], 'r') as yamlfile: + with open(sys.argv[1], 'r') as yamlfile: conda_env = yaml.safe_load(yamlfile) - cenv, penv, forgelist = env_split(conda_env) - if kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "pip": - print("\n".join(penv["dependencies"].pop(-1)["pip"])) - elif kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "default": - _ = yaml.dump(cenv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "conda-forge": - print("\n".join(forgelist)) - else: - raise Exception(f"Invalid Kind: {kind}") + #check for acceptable formats + channel_order = get_channel_order(conda_env) + with open('.make.channel-order.include', 'w') as f: + f. write(' '.join(channel_order[:-1])) #exclude pip as a channel here + + cenv, channel_dict = env_split(conda_env, channel_order) + + for kind in channel_order: + if kind == "pip": + filename = '.make.pip-requirements.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict['pip']['pip'])) + else: + filename = f'.make.{kind}-environment.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict[kind])) diff --git a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py new file mode 100644 index 0000000..8c41a6b --- /dev/null +++ b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py @@ -0,0 +1,14 @@ +import sys +import yaml + + +if __name__ == "__main__": + channel_order = ['defaults', 'pytorch'] + dependency_new = "pytorch::cpuonly" + + with open("environment.yml", "rt", encoding="utf-8") as file_env: + env = yaml.safe_load(file_env) + env["dependencies"].append(dependency_new) + env["channel-order"] = channel_order + with open("environment.yml", "wt", encoding="utf-8") as file_env: + yaml.safe_dump(env, file_env) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py index cd3ea61..872135a 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py @@ -14,6 +14,7 @@ 'project_path': '${catalog_path}/..', 'raw_data_path': '${data_path}/raw', 'template_path': '${project_path}/reference/templates', + 'abfs_cache': '${interim_data_path}/abfs_cache', } _catalog_file = _module_dir.parent / "catalog" / "config.ini" diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py index 1c23d32..b938c9f 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py @@ -1,7 +1,7 @@ from .decorators import 
SingletonDecorator from .kvstore import KVStore from .log import logger -import pathlib +from pathlib import Path class PathStore(KVStore): """Persistent Key-Value store for project-level paths @@ -13,15 +13,16 @@ class PathStore(KVStore): By default, the project directory is the parent of the directory containing the `config_file`: - >>> b['project_path'] - PosixPath('/tmpx/project') - >>> b['data_path'] - PosixPath('/tmpx/project/data') + + >>> b['project_path'] == Path('/tmpx/project').resolve() + True + >>> b['data_path'] == Path('/tmpx/project/data').resolve() + True The `catalog_path` is set upon instantiation and is read-only: - >>> b['catalog_path'] - PosixPath('/tmpx/project/catalog') + >>> b['catalog_path'] == Path('/tmpx/project/catalog').resolve() + True >>> b['catalog_path'] = '/tmp' Traceback (most recent call last): ... @@ -30,21 +31,21 @@ class PathStore(KVStore): Changing a value changes all values that expand to contain it: >>> b['project_path'] = '/tmpy' - >>> b['project_path'] - PosixPath('/tmpy') - >>> b['data_path'] - PosixPath('/tmpy/data') + >>> b['project_path'] == Path('/tmpy').resolve() + True + >>> b['data_path'] == Path('/tmpy/data').resolve() + True We can have multiple levels of expansion: >>> b['raw_data_path'] = "${data_path}/raw" - >>> b['raw_data_path'] - PosixPath('/tmpy/data/raw') + >>> b['raw_data_path'] == Path('/tmpy/data/raw').resolve() + True >>> b['project_path'] = '/tmp3' - >>> b['data_path'] - PosixPath('/tmp3/data') - >>> b['raw_data_path'] - PosixPath('/tmp3/data/raw') + >>> b['data_path'] == Path('/tmp3/data').resolve() + True + >>> b['raw_data_path'] == Path('/tmp3/data/raw').resolve() + True """ # These keys should never be written to disk, though they may be used @@ -58,7 +59,7 @@ def __init__(self, *args, if config_file is None: self._config_file = "config.ini" else: - self._config_file = pathlib.Path(config_file) + self._config_file = Path(config_file) self._usage_warning = False super().__init__(*args, config_section=config_section, config_file=self._config_file, **kwargs) @@ -88,7 +89,7 @@ def __getitem__(self, key): if key in self._protected: return getattr(self, key) self._read() - return pathlib.Path(super().__getitem__(key)).resolve() + return Path(super().__getitem__(key)).resolve() @property def catalog_path(self): diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py index 4e7b43e..81d21fc 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py @@ -2,4 +2,4 @@ from .datasets import * from .fetch import * from .utils import * -from .extra import * +from .fileset import * diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py index 2fa411c..7f44b47 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py @@ -88,8 +88,10 @@ def __init__(self, catalog_file='datasets.json', **kwargs): """ - Object representing a dataset object. - Notionally compatible with scikit-learn's Bunch object + EasyData Dataset container Object. + + Contains metadata (README, LICENSE), associated file list (FILESET), and + optionally a data object. 
dataset_name: string (required) key to use for this dataset @@ -99,7 +101,7 @@ def __init__(self, Either classification target or label to be used. for each of the points in `data` metadata: dict - Data about the object. Key fields include `license_txt`, `descr`, and `hashes` + Data about the object. Key fields include `license`, `readme`, and `hashes` update_hashes: Boolean If True, recompute the data/target hashes in the Metadata """ @@ -118,7 +120,7 @@ def __init__(self, self['metadata']['dataset_name'] = dataset_name self['data'] = data self['target'] = target - #self['extra'] = Extra.from_dict(metadata.get('extra', None)) + #self['fileset'] = Fileset.from_dict(metadata.get('fileset', None)) data_hashes = self._generate_data_hashes() if update_hashes: @@ -153,10 +155,10 @@ def __setattr__(self, key, value): self['metadata'][key.lower()] = value elif key == 'name': self['metadata']['dataset_name'] = value - elif key in ['extra_base', 'extra_auth_kwargs']: + elif key in ['fileset_base', 'fileset_auth']: if self.name not in paths._config.sections(): paths._config.add_section(self.name) - if key == 'extra_auth_kwargs': + if key == 'fileset_auth': paths._config.set(self.name, key, json.dumps(value, sort_keys=True)) else: paths._config.set(self.name, key, value) @@ -170,7 +172,7 @@ def __delattr__(self, key): del self['metadata'][key.lower()] elif key == 'name': raise ValueError("name is mandatory") - elif key == 'extra_base': + elif key == 'fileset_base': if paths._config.has_section(self.name) and paths._config.has_option(self.name, key): paths._config.remove_option(self.name, key) paths._write() @@ -226,26 +228,67 @@ def resolve_local_config(self, key, default=None, kind="string"): raise ValueError(f"Unknown kind: {kind}") @property - def extra_base(self): - return self.resolve_local_config("extra_base", paths['processed_data_path'] / f"{self.name}.extra") + def fileset_base(self): + return self.resolve_local_config("fileset_base", paths['processed_data_path'] / f"{self.name}.fileset") @property - def extra_auth_kwargs(self): - return self.resolve_local_config("extra_auth_kwargs", "{}", kind="json") + def fileset_auth(self): + return self.resolve_local_config("fileset_auth", "{}", kind="json") + + def filesystem(self): + """Return an fsspec filesystem object associated with this fileset_base. + + If present, the kwargs specified in 'Dataset.fileset_auth' will be used to authenticate the connection. These must be valid + parameters to 'fsspec.open()' + + returns: fsspec.FileSystem object + + """ + f = fsspec.open(self.fileset_base, **self.fileset_auth) + return f.fs + + def fileset(self, dirs_only=False): + """Enumerate contents of fileset. 
+ + Automatically prepends `fileset_base` + + Parameters:: + dirs_only: Boolean + if True, returns only directory names containing files + if False, returns files and their associated hashes + + Useful for file formats that are actually directories, like parquet + + Returns: + if dirs_only is True: + list of directories containing files in the fileset + else + tuples of filenames, hashlists for every file in the fileset + """ + eb = self.fileset_base + sep = "/" + ret = [] + for subdir, filedict in self.FILESET.items(): + if dirs_only: + ret.append(sep.join([eb, subdir])) + else: # returns all files + for f, hashlist in filedict.items(): + ret.append((sep.join([eb, subdir, f]), hashlist)) + return ret # Note: won't work because of set/setattr magic above - #@extra_base.deleter - #def extra_base(self): - # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "extra_base"): - # paths._config.remove_option("extra_base") + #@fileset_base.deleter + #def fileset_base(self): + # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "fileset_base"): + # paths._config.remove_option("fileset_base") # Note: Won't work because of setattr magic above - #@extra_base.setter - #def extra_base(self, val): + #@fileset_base.setter + #def fileset_base(self, val): # if self.name not in paths._config.sections(): # paths._config.add_section(self.name) - # paths._config.set(self.name, "extra_base", val) + # paths._config.set(self.name, "fileset_base", val) # paths._write() # logger.debug(f"Writing {paths._config_file}") @@ -579,22 +622,22 @@ def verify_hashes(self, hashdict=None, catalog_path=None): hashdict = c[self.name]["hashes"] return hashdict.items() <= self.metadata['hashes'].items() - def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, hash_types=['size']): + def verify_fileset(self, fileset_base=None, file_dict=None, return_filelists=False, hash_types=['size']): """ - Verify that all files listed in the metadata EXTRA dict are accessible and have good hashes. + Verify that all files listed in the metadata FILESET dict are accessible and have good hashes. Returns boolean - True if all files are accessible and have good hashes - and optional file lists. Parameters ---------- - extra_base: path or None - base for the EXTRA filenames. + fileset_base: path or None + base for the FILESET filenames. 
if passed as explicit parameter, this location will be used - if omitted, the dataset `extra_base` will be read (which checks the local_config, - or self.EXTRA_BASE, in that order) - file_dict: sub-dict of extra dict - if None, default to the whole extra dict + if omitted, the dataset `fileset_base` will be read (which checks the local_config, + or self.FILESET_BASE, in that order) + file_dict: sub-dict of fileset dict + if None, default to the whole fileset dict return_filelists: boolean, default False if True, returns triple (good_hashes, bad_hashes, missing_files) else, returns Boolean (all files good) @@ -617,19 +660,19 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, files that are inaccessible """ - if extra_base is None: - extra_base = self.extra_base - extra_base = pathlib.Path(extra_base) - extra_dict = self.metadata.get('extra', None) + if fileset_base is None: + fileset_base = self.fileset_base + fileset_base = pathlib.Path(fileset_base) + fileset_dict = self.metadata.get('fileset', None) if file_dict is None: - file_dict = extra_dict + file_dict = fileset_dict else: - if not (file_dict.keys() <= extra_dict.keys()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict.keys() <= fileset_dict.keys()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") else: for key in file_dict.keys(): - if not (file_dict[key].items() <= extra_dict[key].items()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict[key].items() <= fileset_dict[key].items()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") retval = False bad_hash = [] @@ -641,7 +684,7 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: for directory in file_dict.keys(): for file, meta_hash_list in file_dict[directory].items(): - path = extra_base / directory / file + path = fileset_base / directory / file rel_path = pathlib.Path(directory) / file if path.exists(): disk_hash_list = [] @@ -660,52 +703,52 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: return retval - def subselect_extra(self, rel_files): - """Convert a (relative) pathname to an EXTRA dict + def subselect_fileset(self, rel_files): + """Convert a (relative) pathname to an FILESET dict - Suitable for passing to verify_extra() + Suitable for passing to verify_fileset() """ - extra_dict = defaultdict(dict) + fileset_dict = defaultdict(dict) for rel_file_path in rel_files: rel_path = pathlib.Path(rel_file_path) try: - hashlist = self.EXTRA[str(rel_path.parent)][rel_path.name] + hashlist = self.FILESET[str(rel_path.parent)][rel_path.name] except KeyError: - raise NotFoundError(f"Not in EXTRA: {rel_file_path}") from None - extra_dict[str(rel_path.parent)][rel_path.name] = hashlist - return dict(extra_dict) + raise NotFoundError(f"Not in FILESET: {rel_file_path}") from None + fileset_dict[str(rel_path.parent)][rel_path.name] = hashlist + return dict(fileset_dict) - def extra_file(self, relative_path): - """Convert a relative path (relative to extra_base) to a fully qualified location + def fileset_file(self, relative_path): + """Convert a relative path (relative to fileset_base) to a fully qualified location - extra_base may be prefixed with optional protocol like `s3://` and + fileset_base may be prefixed with optional protocol like `s3://` and is suitable for passing to fsspec.open_files() Parameters 
---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. """ - extra_base = self.extra_base - if extra_base.startswith("/"): - fqpath = str(pathlib.Path(extra_base) / relative_path) - elif extra_base.endswith('/'): - fqpath = f"{extra_base}{relative_path}" + fileset_base = self.fileset_base + if fileset_base.startswith("/"): + fqpath = str(pathlib.Path(fileset_base) / relative_path) + elif fileset_base.endswith('/'): + fqpath = f"{fileset_base}{relative_path}" else: - fqpath = f"{extra_base}/{relative_path}" + fqpath = f"{fileset_base}/{relative_path}" return fqpath - def open_extra(self, relative_path, auth_kwargs=None, **kwargs): - """Given a path (relative to extra_base), return an fsspec.OpenFile object + def open_fileset(self, relative_path, auth_kwargs=None, **kwargs): + """Given a path (relative to fileset_base), return an fsspec.OpenFile object Parameters ---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. auth_kwargs: dict or None @@ -717,7 +760,7 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): Examples -------- - >>> with ds.open_extra('2020-01-*.csv') as f: + >>> with ds.open_fileset('2020-01-*.csv') as f: ... df = pd.read_csv(f) # doctest: +SKIP Returns @@ -726,11 +769,11 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): be used as a single context """ if auth_kwargs is None: - auth_kwargs = self.extra_auth_kwargs + auth_kwargs = self.fileset_auth if auth_kwargs: logger.debug(f"Passing authentication information via auth_kwargs") - return fsspec.open(self.extra_file(relative_path), **auth_kwargs, **kwargs) + return fsspec.open(self.fileset_file(relative_path), **auth_kwargs, **kwargs) def dump(self, file_base=None, dump_path=None, hash_type='sha1', exists_ok=False, create_dirs=True, dump_metadata=True, update_catalog=True, @@ -867,8 +910,8 @@ def __init__(self, Value of hash used to verify file integrity file_name: string (optional) filename to use when saving file locally. If omitted, it will be inferred from url or source_file - name: string or {'DESCR', 'LICENSE'} (optional) - description of the file. of DESCR or LICENSE, will be used as metadata + name: string or {'README', 'LICENSE'} (optional) + description of the file. of README or LICENSE, will be used as metadata unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. @@ -909,14 +952,14 @@ def file_list(self): logger.warning("file_list is deprecated. 
Use file_dict instead") return list(self.file_dict.values()) - def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='DESCR', unpack_action='copy', force=False): + def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='README', unpack_action='copy', force=False): """Add metadata to a DataSource filename: create metadata entry from contents of this file. Relative to `metadata_path` contents: create metadata entry from this string metadata_path: (default `paths['raw_data_path']`) where to store metadata files - kind: {'DESCR', 'LICENSE'} + kind: {'README', 'LICENSE'} unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. force: boolean (default False) @@ -928,7 +971,7 @@ def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='D metadata_path = pathlib.Path(metadata_path) filename_map = { - 'DESCR': f'{self.name}.readme', + 'README': f'{self.name}.readme', 'LICENSE': f'{self.name}.license', } if kind not in filename_map: @@ -1337,7 +1380,7 @@ def process(self, return_X_y: boolean if True, returns (data, target) instead of a `Dataset` object. use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. """ if not self.unpacked_: logger.debug("process() called before unpack()") @@ -1373,13 +1416,13 @@ def process(self, def default_metadata(self, use_docstring=False): """Returns default metadata derived from this DataSource - This sets the dataset_name, and fills in `license` and `descr` + This sets the dataset_name, and fills in `license` and `readme` fields if they are present, either on disk, or in the file list Parameters ---------- use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. Returns ------- @@ -1388,12 +1431,12 @@ def default_metadata(self, use_docstring=False): metadata = {} optmap = { - 'DESCR': 'descr', + 'README': 'readme', 'LICENSE': 'license', } filemap = { 'license': f'{self.name}.license', - 'descr': f'{self.name}.readme' + 'readme': f'{self.name}.readme' } for key, fetch_dict in self.file_dict.items(): @@ -1406,7 +1449,7 @@ def default_metadata(self, use_docstring=False): if use_docstring: func = partial(self.process_function) fqfunc, invocation = partial_call_signature(func) - metadata['descr'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ + metadata['readme'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ f'{invocation}\n\n>>> help({func.func.__name__})\n\n' + \ f'{func.func.__doc__}' diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py similarity index 59% rename from {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py rename to {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py index 74419cb..f69aced 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py @@ -1,5 +1,5 @@ """ -Functions for handling "extra" data; i.e. collections of raw files associated with a Dataset +Functions for handling "fileset" data; i.e. 
collections of raw files associated with a Dataset """ from collections import defaultdict @@ -13,13 +13,13 @@ from ..log import logger __all__ = [ - 'process_extra_files', + 'process_fileset_files', ] -def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", extra_dir=".extra", dataset_dir=None, do_copy=False): +def process_fileset_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", fileset_dir=".fileset", dataset_dir=None, do_copy=False): """ Process unpacked raw files into its minimal dataset components (data, target, metadata). - Here, 'minimal' means `data` and `target` will be None, and `extra` will contain a + Here, 'minimal' means `data` and `target` will be None, and `fileset` will contain a file dict of files matching the specified file_glob (and their sizes). Parameters @@ -32,11 +32,11 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil Name of the directory of the unpacked zip file containing the raw data files. relative to unpack_dir file_glob: string - Add only files matching this glob pattern to EXTRA - extra_dir: string + Add only files matching this glob pattern to FILESET + fileset_dir: string Used in building the file_dict keys. do_copy: boolean - if True, actually copy the files. Otherwise just build EXTRA + if True, actually copy the files. Otherwise just build FILESET Returns ------- @@ -47,7 +47,7 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil data and target are None, metadata contains a file dict; i.e. - 'extra': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} + 'fileset': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} """ if metadata is None: metadata = {} @@ -63,14 +63,14 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil if extract_dir is not None: unpack_dir /= extract_dir - extra_dir = pathlib.Path(extra_dir) - extra_dir_fq = dataset_dir / extra_dir + fileset_dir = pathlib.Path(fileset_dir) + fileset_dir_fq = dataset_dir / fileset_dir logger.debug(f"Do copy: {do_copy}") if do_copy: - if extra_dir_fq.is_dir(): - logger.warning(f"Cleaning contents of {extra_dir}") - shutil.rmtree(extra_dir_fq) - logger.debug(f"Copying files to {extra_dir_fq}...") + if fileset_dir_fq.is_dir(): + logger.warning(f"Cleaning contents of {fileset_dir}") + shutil.rmtree(fileset_dir_fq) + logger.debug(f"Copying files to {fileset_dir_fq}...") file_dict = defaultdict(dict) files = sorted(list(unpack_dir.rglob(file_glob))) @@ -78,11 +78,11 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil if file.is_dir(): continue relative_path = file.relative_to(unpack_dir) - extra_path = extra_dir / relative_path - file_dict[str(extra_path.parent)][str(extra_path.name)] = [f'size:{os.path.getsize(file)}'] + fileset_path = fileset_dir / relative_path + file_dict[str(fileset_path.parent)][str(fileset_path.name)] = [f'size:{os.path.getsize(file)}'] if do_copy: - os.makedirs(dataset_dir / extra_path.parent, exist_ok=True) - shutil.copyfile(file, dataset_dir / extra_path) - metadata['extra'] = dict(file_dict) + os.makedirs(dataset_dir / fileset_path.parent, exist_ok=True) + shutil.copyfile(file, dataset_dir / fileset_path) + metadata['fileset'] = dict(file_dict) return None, None, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py b/{{ 
cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py index 6054735..31cbb1e 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py @@ -3,6 +3,7 @@ """ import pathlib +from sklearn.datasets import fetch_20newsgroups from tqdm.auto import tqdm @@ -10,4 +11,41 @@ from ..log import logger __all__ = [ + 'process_20_newsgroups' ] + +def process_20_newsgroups(*, extract_dir='20_newsgroups', + metadata=None, unpack_dir=None, + opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): + """ + Process 20 newsgroups into (data, target, metadata) format. + + + Parameters + ---------- + unpack_dir: path + The interim parent directory the dataset files have been unpacked into. + extract_dir: str + Name of the directory of the unpacked files relative to the unpack_dir. Note that + opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} + Options to pass to sklearn.datasets.fetch_20newsgroups. + + + Returns + ------- + A tuple: + (data, target, additional_metadata) + + """ + if metadata is None: + metadata = {} + + if unpack_dir is None: + unpack_dir = paths['interim_data_path'] + else: + unpack_dir = pathlib.Path(unpack_dir) + data_dir = unpack_dir / f"{extract_dir}" + + news = fetch_20newsgroups(**opts) + + return news.data, news.target, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py index 615a3bc..7cdf6ad 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py @@ -12,10 +12,11 @@ from ..utils import run_notebook __all__ = [ - 'run_notebook_transformer', 'apply_single_function', + 'copy_dataset', 'csv_to_pandas', 'new_dataset', + 'run_notebook_transformer', 'sklearn_train_test_split', 'sklearn_transform', ] @@ -163,23 +164,21 @@ def csv_to_pandas(ds_dict, *, output_map, **opts): new_ds = {} df = None for ds_name, dset in ds_dict.items(): - extra = dset.metadata.get('extra', None) - if extra is not None: - logger.debug(f"Input dataset {ds_name} has extra data. Processing...") - for rel_dir, file_dict in extra.items(): + fileset = dset.metadata.get('fileset', None) + if fileset is not None: + logger.debug(f"Input dataset {ds_name} has fileset data. Processing...") + for rel_dir, file_dict in fileset.items(): for new_dsname, csv_filename in output_map.items(): if csv_filename in file_dict: logger.debug(f"Found {csv_filename}. 
Creating {new_dsname} dataset")
                         path = paths['processed_data_path'] / rel_dir / csv_filename
                         df = pd.read_csv(path)
                         new_metadata = dset.metadata
-                        new_metadata.pop('extra', None)
+                        new_metadata.pop('fileset', None)
                         new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=df, metadata=new_metadata)
     return new_ds
 
-
-
-def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_descr_txt, drop_extra, **opts):
+def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_readme_txt, drop_fileset, **opts):
     """
     Parameters
     ----------
@@ -189,12 +188,12 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali
         name of the dataset that the new dataset will be derived from
     dataset_name:
         name of the new dataset_catalog
-    added_descr_txt: Default None
-        new description text to be appended to the metadata descr
+    added_readme_txt: Default None
+        new description text to be appended to the metadata readme
     serialized_function:
         function (serialized by src.utils.serialize_partial) to run on .data to produce the new .data
-    drop_extra: boolean
-        drop the .extra part of the metadata
+    drop_fileset: boolean
+        drop the .fileset part of the metadata
     **opts:
         Remaining options will be ignored
     """
@@ -205,10 +204,10 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali
     ds = ds_dict.get(source_dataset_name)
 
     new_metadata = ds.metadata.copy()
-    new_metadata['descr'] += added_descr_txt
-    if drop_extra:
-        if new_metadata.get('extra', 0) != 0:
-            new_metadata.pop('extra')
+    new_metadata['readme'] += added_readme_txt
+    if drop_fileset:
+        if new_metadata.get('fileset', 0) != 0:
+            new_metadata.pop('fileset')
 
     logger.debug(f"Applying data function...")
     data_function=deserialize_partial(serialized_function)
@@ -222,8 +221,45 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali
         new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata)
     return new_ds
 
+def copy_dataset(ds_dict, *, source_dataset_name, dataset_name, added_readme_txt, drop_fileset=True, drop_data=True, drop_target=False, **opts):
+    """
+    Create a new dataset by copying an existing one
+    Parameters
+    ----------
+    ds_dict:
+        input datasets.
+ source_dataset_name: + name of the dataset that the new dataset will be derived from + dataset_name: + name of the new dataset_catalog + added_readme_txt: Default None + new description text to be appended to the metadata readme + drop_fileset: boolean + drop the .fileset part of the metadata + **opts: + Remaining options will be ignored + """ + + new_ds = {} + + logger.debug(f"Loading {source_dataset_name}...") + ds = ds_dict.get(source_dataset_name) new_metadata = ds.metadata.copy() + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') - new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=preprocessed_corpus, metadata=new_metadata) + if drop_data: + new_data = None + else: + new_data = ds.data.copy() + + if drop_target: + new_target = None + else: + new_target = ds.target.copy() + + new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata) return new_ds diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py index 1283bab..186704c 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py @@ -1,7 +1,9 @@ ## Script common ways of adding a dataset to the workflow from functools import partial +import fsspec import pathlib +import os from .log import logger from . import paths @@ -10,14 +12,17 @@ from .data import (DataSource, Dataset, hash_file, DatasetGraph, Catalog, serialize_transformer_pipeline) from .data.transformer_functions import csv_to_pandas, new_dataset, apply_single_function, run_notebook_transformer -from .data.extra import process_extra_files +from .data.fileset import process_fileset_files from .data.utils import serialize_partial __all__ = [ - 'notebook_as_transformer', 'dataset_from_csv_manual_download', + 'dataset_from_fsurl', 'dataset_from_metadata', 'dataset_from_single_function', + 'derived_dataset', + 'metadata_from_fsspec', + 'notebook_as_transformer', ] @@ -90,7 +95,7 @@ def notebook_as_transformer(notebook_name, *, # Create a Dataset from a single csv file def dataset_from_csv_manual_download(ds_name, csv_path, download_message, - license_str, descr_str, *, hash_type='sha1', + license_str, readme_str, *, hash_type='sha1', hash_value=None, overwrite_catalog=False,): """ @@ -107,7 +112,7 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, Hash, computed via the algorithm specified in `hash_type` license_str: str Contents of metadata license as text - descr_str: + readme_str: Contents of the metadata description as text overwrite_catalog: boolean If True, existing entries in datasets and transformers catalogs will be @@ -136,14 +141,14 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, hash_value=hash_value, unpack_action='copy', force=True) - dsrc.add_metadata(contents=descr_str, force=True) + dsrc.add_metadata(contents=readme_str, force=True) dsrc.add_metadata(contents=license_str, kind='LICENSE', force=True) - process_function = process_extra_files - process_function = process_extra_files + process_function = process_fileset_files + process_function = process_fileset_files process_function_kwargs = {'do_copy':True, 'file_glob':str(csv_path.name), - 'extra_dir': raw_ds_name+'.extra', + 'fileset_dir': raw_ds_name+'.fileset', 'extract_dir': raw_ds_name} 
dsrc.process_function = partial(process_function, **process_function_kwargs) datasource_catalog = Catalog.load('datasources') @@ -202,7 +207,7 @@ def dataset_from_metadata(dataset_name, metadata=None, overwrite_catalog=False): return ds -def dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_descr_txt, drop_extra=True, overwrite_catalog=False): +def dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_readme_txt, drop_fileset=True, overwrite_catalog=False): """ Create a derived dataset (dataset_name) via a single function call on .data from a previous dataset (source_dataset_name). @@ -213,8 +218,8 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func name of the dataset that the new dataset will be derived from dataset_name: name of the new dataset_catalog - added_descr_txt: Default None - new description text to be appended to the metadata descr + added_readme_txt: Default None + new description text to be appended to the metadata readme data_function: function (from src module) to run on .data to produce the new .data overwrite_catalog: boolean @@ -223,7 +228,43 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func dag = DatasetGraph(catalog_path=paths['catalog_path']) serialized_function = serialize_partial(data_function) transformers = [partial(apply_single_function, source_dataset_name=source_dataset_name, dataset_name=dataset_name, - serialized_function=serialized_function, added_descr_txt=added_descr_txt, drop_extra=drop_extra)] + serialized_function=serialized_function, added_readme_txt=added_readme_txt, drop_fileset=drop_fileset)] + dag.add_edge(input_dataset=source_dataset_name, + output_dataset=dataset_name, + transformer_pipeline=serialize_transformer_pipeline(transformers), + overwrite_catalog=overwrite_catalog) + ds = Dataset.from_catalog(dataset_name) + logger.debug(f"{dataset_name} added to catalog") + return ds + +def derived_dataset(*, dataset_name, source_dataset_name, added_readme_txt, + drop_fileset=True, drop_data=True, drop_target=False, + overwrite_catalog=False): + """ + Create a derived dataset (dataset_name) via a single function call on .data from a + previous dataset (source_dataset_name). 
+
+    Parameters
+    ----------
+    source_dataset_name:
+        name of the dataset that the new dataset will be derived from
+    dataset_name:
+        name of the new dataset_catalog
+    added_readme_txt: Default None
+        new description text to be appended to the metadata readme
+    drop_fileset: boolean
+        If True, don't copy fileset data to new dataset
+    drop_data: boolean
+        If True, don't copy data to new dataset
+    drop_target: boolean
+        If True, don't copy target to new dataset
+    overwrite_catalog: boolean
+        if True, existing entries in datasets and transformers catalogs will be overwritten
+    """
+    dag = DatasetGraph(catalog_path=paths['catalog_path'])
+    transformers = [partial(copy_dataset, source_dataset_name=source_dataset_name, dataset_name=dataset_name,
+                            added_readme_txt=added_readme_txt, drop_fileset=drop_fileset, drop_data=drop_data, drop_target=drop_target)]
     dag.add_edge(input_dataset=source_dataset_name,
                  output_dataset=dataset_name,
                  transformer_pipeline=serialize_transformer_pipeline(transformers),
@@ -231,3 +272,159 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func
     ds = Dataset.from_catalog(dataset_name)
     logger.debug(f"{dataset_name} added to catalog")
     return ds
+
+def metadata_from_fsspec(fs, path, metadata=None, fileset=None):
+    """Create metadata, FILESET file list from fsspec URL.
+
+    Creates a metadata dict representing a dataset
+
+    + filenames in all uppercase are assumed to be metadata fields
+    + remaining files are used to populate FILESET data and have their hashes computed.
+
+    Parameters
+    ----------
+    fs:
+        fsspec.filesystem instance (already connected)
+    path:
+        relative to fs
+    metadata:
+        current contents of metadata dict.
+        Metadata obtained from fsurl will overwrite any similarly named fields in this dict
+    fileset:
+        Current contents of FILESET. new data will be appended.
+        Similarly named entries will be overwritten.
+
+    returns metadata dict
+    """
+    # There's a chance this should get rewritten to use 'fsspec.walk'
+
+    if metadata is None:
+        metadata = {}
+    if fileset is None:
+        fileset = metadata.get('fileset', {})
+    protocol = fs.protocol
+    dirs_done = []
+    dirs = [path]
+
+    while dirs:
+        dirname = dirs.pop()
+        rel_dirname = os.path.relpath(dirname, start=path)
+        dirs_done.append(dirname)
+        for file_info in fs.ls(dirname, detail=True):
+            file_type = file_info.get('type', None)
+            file_name = file_info['name']
+            if file_type == 'directory':
+                dirs.append(file_name)
+            elif file_type == 'file':
+                basename = os.path.basename(os.path.normpath(file_name))
+                if str.isupper(basename):
+                    # Add to metadata
+                    with fs.open(file_name, 'r') as fr:
+                        contents = '\n'.join(fr.readlines())
+                    metadata[str.lower(basename)] = contents
+                else:
+                    # add file and hash to FILESET
+                    if protocol == "abfs":
+                        # Cheap way to get md5
+                        md5_arr = file_info['content_settings']['content_md5']
+                        hashval = f"md5:{''.join('{:02x}'.format(x) for x in md5_arr)}"
+                    else:
+                        logger.warning(f"Unsupported fsspec filesystem: {fs.protocol}. Using size as hash")
+                        hashval = f"size:{fs.size(file_name)}"
+                    rel_path = os.path.relpath(file_info['name'], start=dirname) or "."
+ # fileset[rel_dirname][rel_path] = [hashval] + entry = {rel_path:[hashval]} + fileset.setdefault(rel_dirname,{}).update(entry) + else: + raise Exception(f"Unknown file type: {file_type}") + metadata["fileset"] = fileset + return metadata + + + +def dataset_from_fsurl(fsurl, + dataset_name=None, + fsspec_auth=None, + metadata=None, + fileset=None, + overwrite_catalog=True): + """Create a dataset from the contents of an fsspec URL + + 'fsurl' is assumed to be a directory/container/bucket. + + Files in this bucket with names entirely in UPPERCASE are assumed + to be textfiles and are used to populate metadata fields directly + as metadata fields (e.g. README, LICENSE) + + Other files have their hashes added to FILESET, and are included in + the FileSet (FILESET data) associated with the dataset. + + Parameters:: + + fsurl: fsspec URL + Should be a "directory", container, or "subdirectory" of said container. + dataset_name: string or None + Name to use for Dataset. + if None, name is the last component of the fsurl path + metadata: + current contents of metadata dict. + Metadata obtained from fsurl will overwrite any similarly named fields in this dict + fileset: + Current contents of FILESET. new data will be appended. + Similarly named entries will be overwritten. + overwrite_catalog: Boolean + if True, entry in Dataset catalog will be overwritten with the newly generated Dataset + + Returns:: + Dataset containing only metadata and FILESET info for all files in the specified fsspec URL. + + """ + if fsspec_auth is None: + fsspec_auth = {} + + f = fsspec.open(fsurl, **fsspec_auth) + path = f.path + if dataset_name is None: + dataset_name = os.path.basename(os.path.normpath(path)) + logger.debug(f"Inferring dataset_name from fsurl: {dataset_name}") + fs = f.fs + protocol = fs.protocol + meta = metadata_from_fsspec(fs, path, metadata=metadata, fileset=fileset) + meta['fileset_base'] = fsurl + ds = dataset_from_metadata(dataset_name, + metadata=meta, + overwrite_catalog=overwrite_catalog) + return ds + +def derived_dataset(*, dataset_name, source_dataset, added_readme_txt=None, drop_fileset=True, data=None, target=None): + """Create a dataset by copying its metadata from another dataset + + Parameters + ---------- + added_readme_txt: string + String to be appended to the end of the new dataset's README metadata + drop_fileset: boolean + if True, ignore fileset when copying metadata + data: + Will be used as contents of new dataset's `data` + target: + Will be used as contents of new dataset's `target` + dataset_name: String + new dataset name + source_dataset: Dataset + Metadata will be copied from this dataset + + Returns + ------- + new (derived) Dataset object + """ + new_metadata = ds.metadata.copy() + if added_readme_txt: + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') + if new_metadata.get('hashes', 0) != 0: + new_metadata.pop('hashes') + ds_out = Dataset(dataset_name, metadata=new_metadata, data=data, target=target, **kwargs) + return ds_out diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 056f497..31f55a8 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -2,15 +2,12 @@ from functools import partial from {{ 
cookiecutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog -from {{ cookiecutter.module_name }} import workflow, paths +from {{ cookiecutter.module_name }}.data.process_functions import process_20_newsgroups +from {{ cookiecutter.module_name }} import paths from {{ cookiecutter.module_name }}.log import logger # Set up a 20 newsgroups dataset -ds_name = '20_newsgroups' -output_ds_name = ds_name -dsrc = DataSource(ds_name) - license = """ Custom Academic License: "You may use this material free of charge for any educational purpose, provided attribution is given in any lectures or publications that make use of this material." As in http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.data.html. """ @@ -46,51 +43,19 @@ By default we follow the sklearn suggestion to set `remove=('headers', 'footers', 'quotes')` to avoid overfitting. """ +if __name__ =='__main__': + ds_name = '20_newsgroups' + output_ds_name = ds_name + dsrc = DataSource(ds_name) -dsrc.add_metadata(contents=metadata, force=True) -dsrc.add_metadata(contents=license, kind='LICENSE', force=True) - -def process_20_newsgroups(*, extract_dir='20_newsgroups', - metadata=None, unpack_dir=None, - opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): - """ - Process 20 newsgroups into (data, target, metadata) format. - - - Parameters - ---------- - unpack_dir: path - The interim parent directory the dataset files have been unpacked into. - extract_dir: str - Name of the directory of the unpacked files relative to the unpack_dir. Note that - opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} - Options to pass to sklearn.datasets.fetch_20newsgroups. - - - Returns - ------- - A tuple: - (data, target, additional_metadata) - - """ - if metadata is None: - metadata = {} - - if unpack_dir is None: - unpack_dir = paths['interim_data_path'] - else: - unpack_dir = pathlib.Path(unpack_dir) - data_dir = unpack_dir / f"{extract_dir}" - - news = fetch_20newsgroups(**opts) - - return news.data, news.target, metadata + dsrc.add_metadata(contents=metadata, force=True) + dsrc.add_metadata(contents=license, kind='LICENSE', force=True) -process_function = process_20_newsgroups -process_kwargs = {} + process_function = process_20_newsgroups + process_kwargs = {} -dsrc.process_function = partial(process_function, **process_kwargs) -dsrc.update_catalog() + dsrc.process_function = partial(process_function, **process_kwargs) + dsrc.update_catalog() -dag = DatasetGraph() -dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) + dag = DatasetGraph() + dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py index df901a9..03c6d44 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py @@ -1,6 +1,11 @@ -# Workflow is where we patch around API issues in between releases. -# Nothing in this file is intended to be a stable API. use at your own risk, -# as its contents will be regularly deprecated +"""A module where we temporarily smooth our way around API issues in Easydata. + +This is a place where we temporarily address UX and API issues in Easydata, usually by writing convenient wrappers around existing functionality. 
+
+Nothing in here is intended to be a stable API: use it at your own risk, as its contents are regularly deprecated.
+
+"""
+
 import sys
 import logging
 from .data import Catalog, Dataset, DataSource
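
Usage sketch (editorial addition, not part of the patch): reading files relative to a Dataset's `fileset_base` via the renamed `fileset_file`/`open_fileset` API. It assumes the generated project's module_name is `src`; the catalog entry, relative path, and S3 credentials are placeholders.

import pandas as pd

from src.data import Dataset

# Hypothetical catalog entry whose metadata includes a fileset_base such as
# "s3://example-bucket/my-dataset"
ds = Dataset.from_catalog("my_fsspec_dataset")

# fileset_file() simply joins the relative path onto fileset_base
print(ds.fileset_file("raw/2020-01-01.csv"))

# open_fileset() returns an fsspec.OpenFile; auth_kwargs is forwarded to fsspec.open(),
# so for an s3:// fileset_base the placeholder credentials below would be s3fs kwargs.
with ds.open_fileset("raw/2020-01-01.csv",
                     auth_kwargs={"key": "<access-key>", "secret": "<secret-key>"}) as f:
    df = pd.read_csv(f)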
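
Usage sketch: wiring `process_fileset_files` up as a `DataSource.process_function`, mirroring the pattern used in `dataset_from_csv_manual_download`. The datasource name and glob are placeholders; `src` is the assumed module_name.

from functools import partial

from src.data import DataSource
from src.data.fileset import process_fileset_files

dsrc = DataSource("my_raw_files")  # hypothetical datasource name
dsrc.process_function = partial(
    process_fileset_files,
    do_copy=True,                        # actually copy files under the dataset directory
    file_glob="*.csv",                   # only pick up csv files
    fileset_dir="my_raw_files.fileset",  # keys in metadata['fileset'] are built relative to this
    extract_dir="my_raw_files",          # subdirectory of the unpack dir to scan
)
dsrc.update_catalog()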
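
Usage sketch for `dataset_from_csv_manual_download` after the `descr`-to-`readme` rename. Every name, message, and the hash below are placeholders.

from src import paths
from src.helpers import dataset_from_csv_manual_download

dataset_from_csv_manual_download(
    "my_manual_csv",                                   # ds_name
    paths["raw_data_path"] / "my_manual_csv.csv",      # csv_path: where the file was placed by hand
    "Download my_manual_csv.csv from <source> and copy it into the raw data directory.",
    license_str="(placeholder) license text",
    readme_str="(placeholder) description of the csv contents",
    hash_value="<sha1-of-the-downloaded-file>",
    overwrite_catalog=True,
)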
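
Usage sketch for `dataset_from_fsurl`, which builds a metadata/FILESET-only Dataset from an fsspec URL. The Azure container, account, and key are placeholders; `fsspec_auth` is passed straight through to `fsspec.open`, so its keys depend on the backend in use (adlfs/abfs shown).

from src.helpers import dataset_from_fsurl

ds = dataset_from_fsurl(
    "abfs://my-container/my-dataset",                      # hypothetical Azure Blob "directory"
    fsspec_auth={"account_name": "myaccount", "account_key": "<key>"},
    overwrite_catalog=True,
)

# UPPERCASE files (README, LICENSE, ...) become metadata fields; everything else is
# recorded under metadata['fileset'] with an md5 (abfs) or size-based pseudo-hash.
print(ds.metadata.get("readme", ""))
print(ds.metadata["fileset"])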
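
Usage sketch for `dataset_from_single_function` with the new keyword names (`added_readme_txt`, `drop_fileset`). The transform is a placeholder; because `data_function` is serialized by reference via `src.utils.serialize_partial`, in a real project it should live somewhere importable inside the `src` module rather than in a notebook cell.

from src.helpers import dataset_from_single_function

def lowercase_docs(docs):
    """Lowercase every document in a list-of-strings .data payload."""
    return [doc.lower() for doc in docs]

ds = dataset_from_single_function(
    source_dataset_name="20_newsgroups",       # assumes this dataset is already in the catalog
    dataset_name="20_newsgroups_lowercase",
    data_function=lowercase_docs,
    added_readme_txt="\nLowercased copy of 20_newsgroups.",
    drop_fileset=True,
    overwrite_catalog=True,
)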