From db25e0e1387e902a6559f41c439c8cad1c385778 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 15:52:13 -0800 Subject: [PATCH 01/36] update with latest changes --- .../reference/easydata/conda-environments.md | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md index e698b52..724d131 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md @@ -4,13 +4,19 @@ The `{{ cookiecutter.repo_name }}` repo is set up with template code to make man If you haven't yet, configure your conda environment. +**WARNING**: If you have conda-forge listed as a channel in your `.condarc` (or any other channels other than defaults), you may experience great difficulty generating reproducible conda environments. + +We recommend you remove conda-forge (and all other non-default channels) from your `.condarc` file and [set your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html). You can still use conda-forge (or any other conda channel), just specify it explicitly in your `environment.yml` by prefixing your package name with `channel-name::`; e.g. +``` + - wheel # install from the default (anaconda) channel + - pytorch::pytorch # install this from the `pytorch` channel + - conda-forge::tokenizers # install this from conda-forge +``` + ## Configuring your python environment Easydata uses conda to manage python packages installed by both conda **and pip**. ### Adjust your `.condarc` -**WARNING FOR EXISTING CONDA USERS**: If you have `conda-forge` listed as a channel in your `.condarc` (or any other channels other than `default`), **remove them**. These channels should be specified in `environment.yml` instead. - -We also recommend [setting your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html) to reduce package incompatibility problems. This will be the default in conda 5.0, but in order to assure reproducibility, we need to use this behavior now. ``` conda config --set channel_priority strict @@ -26,18 +32,30 @@ conda config --prepend channels defaults conda config --prepend envs_dirs ~/.conda/envs # Store environments in local dir for JupyterHub ``` -### Fix the CONDA_EXE path -* Make note of the path to your conda binary: +#### Locating the `conda` binary +Ensure the Makefile can find your conda binary, either by setting the `CONDA_EXE` environment variable, or by modifying `Makefile.include` directly. + +First, check if `CONDA_EXE` is already set ``` - $ which conda + >>> export | grep CONDA_EXE + CONDA_EXE=/Users/your_username/miniconda3/bin/conda +``` + +If `CONDA_EXE` is not set, you will need to set it manually in `Makefile.include`; i.e. + +* Make note of the path to your conda binary. It should be in the `bin` subdirectory of your Anaconda (or miniconda) installation directory: +``` + >>> which conda # this will only work if conda is in your PATH, otherwise, verify manually ~/miniconda3/bin/conda ``` -* ensure your `CONDA_EXE` environment variable is set correctly in `Makefile.include` +* ensure your `CONDA_EXE` environment variable is set to this value; i.e. ``` - export CONDA_EXE=~/miniconda3/bin/conda + >>> export CONDA_EXE=~/miniconda3/bin/conda ``` +or edit `Makefile.include` directly. 
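If you go the `Makefile.include` route instead, the change is a one-line Make variable assignment. A minimal sketch only (the exact assignment operator in your `Makefile.include` may differ; substitute the path reported by `which conda`):
```
CONDA_EXE := ~/miniconda3/bin/conda
```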
+ ### Create the conda environment -* Create and switch to the virtual environment: +Create and switch to the virtual environment: ``` cd {{ cookiecutter.repo_name }} make create_environment From 82239965fc9cbd7a432b8454f0e45143581ceef7 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 16:19:31 -0800 Subject: [PATCH 02/36] clean up --- .../reference/easydata/datasets.md | 29 ++++++++++--------- .../reference/easydata/git-workflow.md | 16 +++++----- .../reference/easydata/notebooks.md | 4 +-- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md index 5b54c1e..2cd4687 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md @@ -3,8 +3,8 @@ ## TL;DR To get started, all you really need to know is that you can query for available datasets via ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog() +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets") ``` and load these datasets via @@ -15,15 +15,18 @@ ds = Dataset.load(dataset_name) If you've followed the instructions from building the repo contained in the [README](../README.md), this should just work (if it doesn't, please let us know)! -You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.DESCR` and `ds.LICENSE`. +You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.README` and `ds.LICENSE`. -**Warning**: some of the datasets can be quite large. If you want to store your data externally, we recommend symlinking your data directory (that is the `{{ cookiecutter.repo_name }}/data` directory) to somewhere with more room before loading your first `Dataset`. +**Disk Space Note**: sometimes datasets can be quite large. If you want to store your data externally, we recommend pointing your data directory to a new location; that is, +```python +from {{ cookiecutter.module_name }} import paths +paths["data_path"] = "/path/to/big/data/directory" +``` ## Digging Deeper It is useful to know a little bit more about how Datasets work. - ## What is a `Dataset` object? A Dataset is the fundamental object we use for turning raw data into useful datasets, reproducibly. It is like a scikit-learn-style `Bunch` object --- essentially, a dictionary with some extra magic to make it nicer to work with --- containing the following attributes: @@ -36,7 +39,7 @@ A Dataset is the fundamental object we use for turning raw data into useful data The `data` attribute can really be any processed data form that you like: sometimes it's a pandas dataframe (like with `wine_reviews_130k`), a list of tuples containing other data, (`reddit_comment_tree_graphs`), or other formats including `scipy.sparse` matrices or `igraph` graphs. The `target` (if you're using it), expects something that matches the `data` in terms of length. -For a hint as to which data format to expect, you can look at the contents of the `DESCR` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. +For a hint as to which data format to expect, you can look at the contents of the `README` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. This `metadata` is where things get interesting... 
which we'll cover on its own next. @@ -44,9 +47,9 @@ This `metadata` is where things get interesting... which we'll cover on its own The `metadata` is where the magic lives. It serves several purposes in terms of bookkeeping: * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, -* it provides easy access to **what the data is** via the `DESCR` attribute, +* it provides easy access to **what the data is** via the `README` attribute, * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). -* it provides the **extra data manifest**, `EXTRA`, if your dataset includes around additional raw data (extra) files. +* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (extra) files. In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. @@ -73,21 +76,19 @@ ds.metadata To access the most common metadata fields: ```python -ds.DESCR # or ds.metadata['descr'] +ds.README # or ds.metadata['descr'] ds.LICENSE # or ds.metadata['license'] ds.HASHES # or ds.metadata['hashes'] ``` ## The catalog -While we do our best to keep the documentation in [Available Datasets](docs/available-datasets.md) up-to-date with what's in the code, you can explore all of the currently available `Datasets` via the `dataset_catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. +You can explore all of the currently available `Datasets` via the Dataset `Catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. To access the catalog: ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog(keys_only=True) +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets') ``` -If you're interested, set `keys_only=False` to see the complete contents of the metadata that is saved in the catalog. - ## Sharing your Data as a `Dataset` object In order to convert your data to a `Dataset` object, you will need to generate a catalog *recipe*, that uses a custom *function for processing your raw data*. Doing so allows us to document all the munging, pre-processing, and data verification necessary to reproducibly build the dataset. diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md index ce9e87d..3eecd80 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md @@ -1,5 +1,5 @@ -# The Easydata Git Workflow -Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][cookiecutter-easydata]. +# The EasyData Git Workflow +Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][easydata]. ## Git configuration @@ -49,7 +49,7 @@ git merge {{cookiecutter.default_branch}} git push origin my_branch ``` -### Do I have any stale branches? 
+### Clean up the junk With your local `{{cookiecutter.default_branch}}`, `origin/{{cookiecutter.default_branch}}` and `upstream/{{cookiecutter.default_branch}}` all in sync, we like to clean up any old branches that are fully merged (and hence, can be deleted without data loss.) ```bash git branch --merged {{cookiecutter.default_branch}} @@ -58,15 +58,15 @@ git branch -d A really great feature of `git branch -d` is that it will refuse to remove a branch that hasn't been fully merged into another. Thus it's safe to use without any fear of data loss. -### Time to start the day +### Start the day Once you've finished all your merge tasks, you can create a clean working branch from the latest `{{cookiecutter.default_branch}}` by doing a: ```bash git checkout {{cookiecutter.default_branch}} git checkout -b new_branch_name ``` +That's it! Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue in our +[easydata issue tracker]. -That's it!. Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue at -[cookiecutter-easydata]. - -[cookiecutter-easydata]: https://github.com/hackalog/cookiecutter-easydata/ \ No newline at end of file +[easydata issue tracker]: https://github.com/hackalog/easydata/issues +[easydata]: https://github.com/hackalog/easydata \ No newline at end of file diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md index f975369..4bae065 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md @@ -79,8 +79,7 @@ output_notebook(resources=INLINE) # Source module imports from {{ cookiecutter.module_name }} import paths -from {{ cookiecutter.module_name }}.data import DataSource, Dataset -from {{ cookiecutter.module_name }} import workflow +from {{ cookiecutter.module_name }}.data import DataSource, Dataset, Catalog ``` You can also find most of these header cells in [00-xyz-sample-notebook.ipynb](../notebooks/00-xyz-sample-notebook.ipynb) @@ -99,6 +98,7 @@ There is a whole world of cell magics. 
These are bits of code that you can put a * [README](../README.md) * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) * [Getting and Using Datasets](datasets.md) +* [Specifying Paths in Easydata](paths.md) * [Using Notebooks for Analysis](notebooks.md) * [Sharing your Work](sharing-your-work.md) * [Troubleshooting Guide](troubleshooting.md) From ef04660557385d1c9a30117f78b58bcb38f81e8d Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 17:08:56 -0800 Subject: [PATCH 03/36] sync with snp changes, especially descr -> readme and extra -> fileset --- .../__init__.py | 1 + .../{{ cookiecutter.module_name }}/_paths.py | 39 ++-- .../data/__init__.py | 2 +- .../data/datasets.py | 189 +++++++++------ .../data/extra.py | 88 ------- .../data/transformer_functions.py | 74 ++++-- .../{{ cookiecutter.module_name }}/helpers.py | 218 +++++++++++++++++- .../workflow.py | 11 +- 8 files changed, 411 insertions(+), 211 deletions(-) delete mode 100644 {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py index cd3ea61..872135a 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py @@ -14,6 +14,7 @@ 'project_path': '${catalog_path}/..', 'raw_data_path': '${data_path}/raw', 'template_path': '${project_path}/reference/templates', + 'abfs_cache': '${interim_data_path}/abfs_cache', } _catalog_file = _module_dir.parent / "catalog" / "config.ini" diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py index 1c23d32..cc3a82c 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py @@ -1,7 +1,7 @@ from .decorators import SingletonDecorator from .kvstore import KVStore from .log import logger -import pathlib +import pathlib import Path class PathStore(KVStore): """Persistent Key-Value store for project-level paths @@ -13,15 +13,16 @@ class PathStore(KVStore): By default, the project directory is the parent of the directory containing the `config_file`: - >>> b['project_path'] - PosixPath('/tmpx/project') - >>> b['data_path'] - PosixPath('/tmpx/project/data') + + >>> b['project_path'] == Path('/tmpx/project').resolve() + True + >>> b['data_path'] == Path('/tmpx/project/data').resolve() + True The `catalog_path` is set upon instantiation and is read-only: - >>> b['catalog_path'] - PosixPath('/tmpx/project/catalog') + >>> b['catalog_path'] == Path('/tmpx/project/catalog').resolve() + True >>> b['catalog_path'] = '/tmp' Traceback (most recent call last): ... 
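The doctests above describe how the paths store resolves keys and expands them against one another. In project code, this is the mechanism that lets a single reassignment retarget every derived path, as the `datasets.md` change earlier in this series suggests. A minimal usage sketch (the module name is the cookiecutter placeholder used throughout this template; the target directory is only an example):
```python
from {{ cookiecutter.module_name }} import paths

paths["raw_data_path"]                              # resolved pathlib.Path, i.e. <data_path>/raw
paths["data_path"] = "/path/to/big/data/directory"  # example location only
paths["raw_data_path"]                              # now resolves under the new data_path
```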
@@ -30,21 +31,21 @@ class PathStore(KVStore): Changing a value changes all values that expand to contain it: >>> b['project_path'] = '/tmpy' - >>> b['project_path'] - PosixPath('/tmpy') - >>> b['data_path'] - PosixPath('/tmpy/data') + >>> b['project_path'] == Path('/tmpy').resolve() + True + >>> b['data_path'] == Path('/tmpy/data').resolve() + True We can have multiple levels of expansion: >>> b['raw_data_path'] = "${data_path}/raw" - >>> b['raw_data_path'] - PosixPath('/tmpy/data/raw') + >>> b['raw_data_path'] == Path('/tmpy/data/raw').resolve() + True >>> b['project_path'] = '/tmp3' - >>> b['data_path'] - PosixPath('/tmp3/data') - >>> b['raw_data_path'] - PosixPath('/tmp3/data/raw') + >>> b['data_path'] == Path('/tmp3/data').resolve() + True + >>> b['raw_data_path'] == Path('/tmp3/data/raw').resolve() + True """ # These keys should never be written to disk, though they may be used @@ -58,7 +59,7 @@ def __init__(self, *args, if config_file is None: self._config_file = "config.ini" else: - self._config_file = pathlib.Path(config_file) + self._config_file = Path(config_file) self._usage_warning = False super().__init__(*args, config_section=config_section, config_file=self._config_file, **kwargs) @@ -88,7 +89,7 @@ def __getitem__(self, key): if key in self._protected: return getattr(self, key) self._read() - return pathlib.Path(super().__getitem__(key)).resolve() + return Path(super().__getitem__(key)).resolve() @property def catalog_path(self): diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py index 4e7b43e..81d21fc 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py @@ -2,4 +2,4 @@ from .datasets import * from .fetch import * from .utils import * -from .extra import * +from .fileset import * diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py index 2fa411c..7f44b47 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py @@ -88,8 +88,10 @@ def __init__(self, catalog_file='datasets.json', **kwargs): """ - Object representing a dataset object. - Notionally compatible with scikit-learn's Bunch object + EasyData Dataset container Object. + + Contains metadata (README, LICENSE), associated file list (FILESET), and + optionally a data object. dataset_name: string (required) key to use for this dataset @@ -99,7 +101,7 @@ def __init__(self, Either classification target or label to be used. for each of the points in `data` metadata: dict - Data about the object. Key fields include `license_txt`, `descr`, and `hashes` + Data about the object. 
Key fields include `license`, `readme`, and `hashes` update_hashes: Boolean If True, recompute the data/target hashes in the Metadata """ @@ -118,7 +120,7 @@ def __init__(self, self['metadata']['dataset_name'] = dataset_name self['data'] = data self['target'] = target - #self['extra'] = Extra.from_dict(metadata.get('extra', None)) + #self['fileset'] = Fileset.from_dict(metadata.get('fileset', None)) data_hashes = self._generate_data_hashes() if update_hashes: @@ -153,10 +155,10 @@ def __setattr__(self, key, value): self['metadata'][key.lower()] = value elif key == 'name': self['metadata']['dataset_name'] = value - elif key in ['extra_base', 'extra_auth_kwargs']: + elif key in ['fileset_base', 'fileset_auth']: if self.name not in paths._config.sections(): paths._config.add_section(self.name) - if key == 'extra_auth_kwargs': + if key == 'fileset_auth': paths._config.set(self.name, key, json.dumps(value, sort_keys=True)) else: paths._config.set(self.name, key, value) @@ -170,7 +172,7 @@ def __delattr__(self, key): del self['metadata'][key.lower()] elif key == 'name': raise ValueError("name is mandatory") - elif key == 'extra_base': + elif key == 'fileset_base': if paths._config.has_section(self.name) and paths._config.has_option(self.name, key): paths._config.remove_option(self.name, key) paths._write() @@ -226,26 +228,67 @@ def resolve_local_config(self, key, default=None, kind="string"): raise ValueError(f"Unknown kind: {kind}") @property - def extra_base(self): - return self.resolve_local_config("extra_base", paths['processed_data_path'] / f"{self.name}.extra") + def fileset_base(self): + return self.resolve_local_config("fileset_base", paths['processed_data_path'] / f"{self.name}.fileset") @property - def extra_auth_kwargs(self): - return self.resolve_local_config("extra_auth_kwargs", "{}", kind="json") + def fileset_auth(self): + return self.resolve_local_config("fileset_auth", "{}", kind="json") + + def filesystem(self): + """Return an fsspec filesystem object associated with this fileset_base. + + If present, the kwargs specified in 'Dataset.fileset_auth' will be used to authenticate the connection. These must be valid + parameters to 'fsspec.open()' + + returns: fsspec.FileSystem object + + """ + f = fsspec.open(self.fileset_base, **self.fileset_auth) + return f.fs + + def fileset(self, dirs_only=False): + """Enumerate contents of fileset. 
+ + Automatically prepends `fileset_base` + + Parameters:: + dirs_only: Boolean + if True, returns only directory names containing files + if False, returns files and their associated hashes + + Useful for file formats that are actually directories, like parquet + + Returns: + if dirs_only is True: + list of directories containing files in the fileset + else + tuples of filenames, hashlists for every file in the fileset + """ + eb = self.fileset_base + sep = "/" + ret = [] + for subdir, filedict in self.FILESET.items(): + if dirs_only: + ret.append(sep.join([eb, subdir])) + else: # returns all files + for f, hashlist in filedict.items(): + ret.append((sep.join([eb, subdir, f]), hashlist)) + return ret # Note: won't work because of set/setattr magic above - #@extra_base.deleter - #def extra_base(self): - # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "extra_base"): - # paths._config.remove_option("extra_base") + #@fileset_base.deleter + #def fileset_base(self): + # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "fileset_base"): + # paths._config.remove_option("fileset_base") # Note: Won't work because of setattr magic above - #@extra_base.setter - #def extra_base(self, val): + #@fileset_base.setter + #def fileset_base(self, val): # if self.name not in paths._config.sections(): # paths._config.add_section(self.name) - # paths._config.set(self.name, "extra_base", val) + # paths._config.set(self.name, "fileset_base", val) # paths._write() # logger.debug(f"Writing {paths._config_file}") @@ -579,22 +622,22 @@ def verify_hashes(self, hashdict=None, catalog_path=None): hashdict = c[self.name]["hashes"] return hashdict.items() <= self.metadata['hashes'].items() - def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, hash_types=['size']): + def verify_fileset(self, fileset_base=None, file_dict=None, return_filelists=False, hash_types=['size']): """ - Verify that all files listed in the metadata EXTRA dict are accessible and have good hashes. + Verify that all files listed in the metadata FILESET dict are accessible and have good hashes. Returns boolean - True if all files are accessible and have good hashes - and optional file lists. Parameters ---------- - extra_base: path or None - base for the EXTRA filenames. + fileset_base: path or None + base for the FILESET filenames. 
if passed as explicit parameter, this location will be used - if omitted, the dataset `extra_base` will be read (which checks the local_config, - or self.EXTRA_BASE, in that order) - file_dict: sub-dict of extra dict - if None, default to the whole extra dict + if omitted, the dataset `fileset_base` will be read (which checks the local_config, + or self.FILESET_BASE, in that order) + file_dict: sub-dict of fileset dict + if None, default to the whole fileset dict return_filelists: boolean, default False if True, returns triple (good_hashes, bad_hashes, missing_files) else, returns Boolean (all files good) @@ -617,19 +660,19 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, files that are inaccessible """ - if extra_base is None: - extra_base = self.extra_base - extra_base = pathlib.Path(extra_base) - extra_dict = self.metadata.get('extra', None) + if fileset_base is None: + fileset_base = self.fileset_base + fileset_base = pathlib.Path(fileset_base) + fileset_dict = self.metadata.get('fileset', None) if file_dict is None: - file_dict = extra_dict + file_dict = fileset_dict else: - if not (file_dict.keys() <= extra_dict.keys()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict.keys() <= fileset_dict.keys()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") else: for key in file_dict.keys(): - if not (file_dict[key].items() <= extra_dict[key].items()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict[key].items() <= fileset_dict[key].items()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") retval = False bad_hash = [] @@ -641,7 +684,7 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: for directory in file_dict.keys(): for file, meta_hash_list in file_dict[directory].items(): - path = extra_base / directory / file + path = fileset_base / directory / file rel_path = pathlib.Path(directory) / file if path.exists(): disk_hash_list = [] @@ -660,52 +703,52 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: return retval - def subselect_extra(self, rel_files): - """Convert a (relative) pathname to an EXTRA dict + def subselect_fileset(self, rel_files): + """Convert a (relative) pathname to an FILESET dict - Suitable for passing to verify_extra() + Suitable for passing to verify_fileset() """ - extra_dict = defaultdict(dict) + fileset_dict = defaultdict(dict) for rel_file_path in rel_files: rel_path = pathlib.Path(rel_file_path) try: - hashlist = self.EXTRA[str(rel_path.parent)][rel_path.name] + hashlist = self.FILESET[str(rel_path.parent)][rel_path.name] except KeyError: - raise NotFoundError(f"Not in EXTRA: {rel_file_path}") from None - extra_dict[str(rel_path.parent)][rel_path.name] = hashlist - return dict(extra_dict) + raise NotFoundError(f"Not in FILESET: {rel_file_path}") from None + fileset_dict[str(rel_path.parent)][rel_path.name] = hashlist + return dict(fileset_dict) - def extra_file(self, relative_path): - """Convert a relative path (relative to extra_base) to a fully qualified location + def fileset_file(self, relative_path): + """Convert a relative path (relative to fileset_base) to a fully qualified location - extra_base may be prefixed with optional protocol like `s3://` and + fileset_base may be prefixed with optional protocol like `s3://` and is suitable for passing to fsspec.open_files() Parameters 
---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. """ - extra_base = self.extra_base - if extra_base.startswith("/"): - fqpath = str(pathlib.Path(extra_base) / relative_path) - elif extra_base.endswith('/'): - fqpath = f"{extra_base}{relative_path}" + fileset_base = self.fileset_base + if fileset_base.startswith("/"): + fqpath = str(pathlib.Path(fileset_base) / relative_path) + elif fileset_base.endswith('/'): + fqpath = f"{fileset_base}{relative_path}" else: - fqpath = f"{extra_base}/{relative_path}" + fqpath = f"{fileset_base}/{relative_path}" return fqpath - def open_extra(self, relative_path, auth_kwargs=None, **kwargs): - """Given a path (relative to extra_base), return an fsspec.OpenFile object + def open_fileset(self, relative_path, auth_kwargs=None, **kwargs): + """Given a path (relative to fileset_base), return an fsspec.OpenFile object Parameters ---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. auth_kwargs: dict or None @@ -717,7 +760,7 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): Examples -------- - >>> with ds.open_extra('2020-01-*.csv') as f: + >>> with ds.open_fileset('2020-01-*.csv') as f: ... df = pd.read_csv(f) # doctest: +SKIP Returns @@ -726,11 +769,11 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): be used as a single context """ if auth_kwargs is None: - auth_kwargs = self.extra_auth_kwargs + auth_kwargs = self.fileset_auth if auth_kwargs: logger.debug(f"Passing authentication information via auth_kwargs") - return fsspec.open(self.extra_file(relative_path), **auth_kwargs, **kwargs) + return fsspec.open(self.fileset_file(relative_path), **auth_kwargs, **kwargs) def dump(self, file_base=None, dump_path=None, hash_type='sha1', exists_ok=False, create_dirs=True, dump_metadata=True, update_catalog=True, @@ -867,8 +910,8 @@ def __init__(self, Value of hash used to verify file integrity file_name: string (optional) filename to use when saving file locally. If omitted, it will be inferred from url or source_file - name: string or {'DESCR', 'LICENSE'} (optional) - description of the file. of DESCR or LICENSE, will be used as metadata + name: string or {'README', 'LICENSE'} (optional) + description of the file. of README or LICENSE, will be used as metadata unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. @@ -909,14 +952,14 @@ def file_list(self): logger.warning("file_list is deprecated. 
Use file_dict instead") return list(self.file_dict.values()) - def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='DESCR', unpack_action='copy', force=False): + def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='README', unpack_action='copy', force=False): """Add metadata to a DataSource filename: create metadata entry from contents of this file. Relative to `metadata_path` contents: create metadata entry from this string metadata_path: (default `paths['raw_data_path']`) where to store metadata files - kind: {'DESCR', 'LICENSE'} + kind: {'README', 'LICENSE'} unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. force: boolean (default False) @@ -928,7 +971,7 @@ def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='D metadata_path = pathlib.Path(metadata_path) filename_map = { - 'DESCR': f'{self.name}.readme', + 'README': f'{self.name}.readme', 'LICENSE': f'{self.name}.license', } if kind not in filename_map: @@ -1337,7 +1380,7 @@ def process(self, return_X_y: boolean if True, returns (data, target) instead of a `Dataset` object. use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. """ if not self.unpacked_: logger.debug("process() called before unpack()") @@ -1373,13 +1416,13 @@ def process(self, def default_metadata(self, use_docstring=False): """Returns default metadata derived from this DataSource - This sets the dataset_name, and fills in `license` and `descr` + This sets the dataset_name, and fills in `license` and `readme` fields if they are present, either on disk, or in the file list Parameters ---------- use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. Returns ------- @@ -1388,12 +1431,12 @@ def default_metadata(self, use_docstring=False): metadata = {} optmap = { - 'DESCR': 'descr', + 'README': 'readme', 'LICENSE': 'license', } filemap = { 'license': f'{self.name}.license', - 'descr': f'{self.name}.readme' + 'readme': f'{self.name}.readme' } for key, fetch_dict in self.file_dict.items(): @@ -1406,7 +1449,7 @@ def default_metadata(self, use_docstring=False): if use_docstring: func = partial(self.process_function) fqfunc, invocation = partial_call_signature(func) - metadata['descr'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ + metadata['readme'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ f'{invocation}\n\n>>> help({func.func.__name__})\n\n' + \ f'{func.func.__doc__}' diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py deleted file mode 100644 index 74419cb..0000000 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Functions for handling "extra" data; i.e. collections of raw files associated with a Dataset -""" - -from collections import defaultdict -import pathlib -import shutil -import os - -from tqdm.auto import tqdm - -from .. 
import paths -from ..log import logger - -__all__ = [ - 'process_extra_files', -] - -def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", extra_dir=".extra", dataset_dir=None, do_copy=False): - """ - Process unpacked raw files into its minimal dataset components (data, target, metadata). - Here, 'minimal' means `data` and `target` will be None, and `extra` will contain a - file dict of files matching the specified file_glob (and their sizes). - - Parameters - ---------- - unpack_dir: default paths['interim_data_path'] - The directory the interim data files have been unpacked into - dataset_dir: default paths['processed_data_path'] - location of processed datasets. - extract_dir: - Name of the directory of the unpacked zip file containing the raw data files. - relative to unpack_dir - file_glob: string - Add only files matching this glob pattern to EXTRA - extra_dir: string - Used in building the file_dict keys. - do_copy: boolean - if True, actually copy the files. Otherwise just build EXTRA - - Returns - ------- - (data, target, additional_metadata) - - where - - data and target are None, - - metadata contains a file dict; i.e. - 'extra': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} - """ - if metadata is None: - metadata = {} - - if dataset_dir is None: - dataset_dir = paths['processed_data_path'] - else: - dataset_dir = pathlib.Path(dataset_dir) - if unpack_dir is None: - unpack_dir = paths['interim_data_path'] - else: - unpack_dir = pathlib.Path(unpack_dir) - if extract_dir is not None: - unpack_dir /= extract_dir - - extra_dir = pathlib.Path(extra_dir) - extra_dir_fq = dataset_dir / extra_dir - logger.debug(f"Do copy: {do_copy}") - if do_copy: - if extra_dir_fq.is_dir(): - logger.warning(f"Cleaning contents of {extra_dir}") - shutil.rmtree(extra_dir_fq) - logger.debug(f"Copying files to {extra_dir_fq}...") - - file_dict = defaultdict(dict) - files = sorted(list(unpack_dir.rglob(file_glob))) - for i, file in enumerate(tqdm(files)): - if file.is_dir(): - continue - relative_path = file.relative_to(unpack_dir) - extra_path = extra_dir / relative_path - file_dict[str(extra_path.parent)][str(extra_path.name)] = [f'size:{os.path.getsize(file)}'] - if do_copy: - os.makedirs(dataset_dir / extra_path.parent, exist_ok=True) - shutil.copyfile(file, dataset_dir / extra_path) - metadata['extra'] = dict(file_dict) - - return None, None, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py index 615a3bc..525ca5d 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py @@ -12,10 +12,11 @@ from ..utils import run_notebook __all__ = [ - 'run_notebook_transformer', 'apply_single_function', + 'copy_dataset', 'csv_to_pandas', 'new_dataset', + 'run_notebook_transformer', 'sklearn_train_test_split', 'sklearn_transform', ] @@ -163,23 +164,23 @@ def csv_to_pandas(ds_dict, *, output_map, **opts): new_ds = {} df = None for ds_name, dset in ds_dict.items(): - extra = dset.metadata.get('extra', None) - if extra is not None: - logger.debug(f"Input dataset {ds_name} has extra data. 
Processing...") - for rel_dir, file_dict in extra.items(): + fileset = dset.metadata.get('fileset', None) + if fileset is not None: + logger.debug(f"Input dataset {ds_name} has fileset data. Processing...") + for rel_dir, file_dict in fileset.items(): for new_dsname, csv_filename in output_map.items(): if csv_filename in file_dict: logger.debug(f"Found {csv_filename}. Creating {new_dsname} dataset") path = paths['processed_data_path'] / rel_dir / csv_filename df = pd.read_csv(path) new_metadata = dset.metadata - new_metadata.pop('extra', None) + new_metadata.pop('fileset', None) new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=df, metadata=new_metadata) return new_ds -def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_descr_txt, drop_extra, **opts): +def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_readme_txt, drop_fileset, **opts): """ Parameters ---------- @@ -189,12 +190,12 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali name of the dataset that the new dataset will be derived from dataset_name: name of the new dataset_catalog - added_descr_txt: Default None - new description text to be appended to the metadata descr + added_readme_txt: Default None + new description text to be appended to the metadata readme serialized_function: function (serialized by src.utils.serialize_partial) to run on .data to produce the new .data - drop_extra: boolean - drop the .extra part of the metadata + drop_fileset: boolean + drop the .fileset part of the metadata **opts: Remaining options will be ignored """ @@ -205,10 +206,10 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali ds = ds_dict.get(source_dataset_name) new_metadata = ds.metadata.copy() - new_metadata['descr'] += added_descr_txt - if drop_extra: - if new_metadata.get('extra', 0) != 0: - new_metadata.pop('extra') + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') logger.debug(f"Applying data function...") data_function=deserialize_partial(serialized_function) @@ -227,3 +228,46 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=preprocessed_corpus, metadata=new_metadata) return new_ds + +def copy_dataset(ds_dict, *, source_dataset_name, dataset_name, added_readme_txt, drop_fileset=True, **opts): + """ + Create a new dataset by copying an existing one + Parameters + ---------- + ds_dict: + input datasets. 
+ source_dataset_name: + name of the dataset that the new dataset will be derived from + dataset_name: + name of the new dataset_catalog + added_readme_txt: Default None + new description text to be appended to the metadata readme + drop_fileset: boolean + drop the .fileset part of the metadata + **opts: + Remaining options will be ignored + """ + + new_ds = {} + + logger.debug(f"Loading {source_dataset_name}...") + ds = ds_dict.get(source_dataset_name) + + new_metadata = ds.metadata.copy() + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') + + if drop_data: + new_data = None + else: + new_data = ds.data.copy() + + if drop_target: + new_target = None + else: + new_target = ds.target.copy() + + new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata) + return new_ds diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py index 1283bab..f103370 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py @@ -1,7 +1,9 @@ ## Script common ways of adding a dataset to the workflow from functools import partial +import fsspec import pathlib +import os from .log import logger from . import paths @@ -10,7 +12,7 @@ from .data import (DataSource, Dataset, hash_file, DatasetGraph, Catalog, serialize_transformer_pipeline) from .data.transformer_functions import csv_to_pandas, new_dataset, apply_single_function, run_notebook_transformer -from .data.extra import process_extra_files +from .data.fileset import process_fileset_files from .data.utils import serialize_partial __all__ = [ @@ -90,7 +92,7 @@ def notebook_as_transformer(notebook_name, *, # Create a Dataset from a single csv file def dataset_from_csv_manual_download(ds_name, csv_path, download_message, - license_str, descr_str, *, hash_type='sha1', + license_str, readme_str, *, hash_type='sha1', hash_value=None, overwrite_catalog=False,): """ @@ -107,7 +109,7 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, Hash, computed via the algorithm specified in `hash_type` license_str: str Contents of metadata license as text - descr_str: + readme_str: Contents of the metadata description as text overwrite_catalog: boolean If True, existing entries in datasets and transformers catalogs will be @@ -136,15 +138,15 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, hash_value=hash_value, unpack_action='copy', force=True) - dsrc.add_metadata(contents=descr_str, force=True) + dsrc.add_metadata(contents=readme_str, force=True) dsrc.add_metadata(contents=license_str, kind='LICENSE', force=True) - process_function = process_extra_files - process_function = process_extra_files + process_function = process_fileset_files + process_function = process_fileset_files process_function_kwargs = {'do_copy':True, 'file_glob':str(csv_path.name), - 'extra_dir': raw_ds_name+'.extra', - 'extract_dir': raw_ds_name} + 'fileset_dir': raw_ds_name+'.fileset', + 'filesetct_dir': raw_ds_name} dsrc.process_function = partial(process_function, **process_function_kwargs) datasource_catalog = Catalog.load('datasources') datasource_catalog[dsrc.name] = dsrc.to_dict() @@ -202,7 +204,7 @@ def dataset_from_metadata(dataset_name, metadata=None, overwrite_catalog=False): return ds -def 
dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_descr_txt, drop_extra=True, overwrite_catalog=False): +def dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_readme_txt, drop_fileset=True, overwrite_catalog=False): """ Create a derived dataset (dataset_name) via a single function call on .data from a previous dataset (source_dataset_name). @@ -213,8 +215,8 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func name of the dataset that the new dataset will be derived from dataset_name: name of the new dataset_catalog - added_descr_txt: Default None - new description text to be appended to the metadata descr + added_readme_txt: Default None + new description text to be appended to the metadata readme data_function: function (from src module) to run on .data to produce the new .data overwrite_catalog: boolean @@ -223,7 +225,7 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func dag = DatasetGraph(catalog_path=paths['catalog_path']) serialized_function = serialize_partial(data_function) transformers = [partial(apply_single_function, source_dataset_name=source_dataset_name, dataset_name=dataset_name, - serialized_function=serialized_function, added_descr_txt=added_descr_txt, drop_extra=drop_extra)] + serialized_function=serialized_function, added_readme_txt=added_readme_txt, drop_fileset=drop_fileset)] dag.add_edge(input_dataset=source_dataset_name, output_dataset=dataset_name, transformer_pipeline=serialize_transformer_pipeline(transformers), @@ -231,3 +233,195 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func ds = Dataset.from_catalog(dataset_name) logger.debug(f"{dataset_name} added to catalog") return ds + +def derived_dataset(*, dataset_name, source_dataset_name, added_readme_txt, + drop_fileset=True, drop_data=True, drop_target=False, + overwrite_catalog=False): + """ + Create a derived dataset (dataset_name) via a single function call on .data from a + previous dataset (source_dataset_name). + + Parameters + ---------- + source_dataset_name: + name of the dataset that the new dataset will be derived from + dataset_name: + name of the new dataset_catalog + added_readme_txt: Default None + new description text to be appended to the metadata readme + drop_fileset: boolean + If True, don't copy fileset data to new dataset + drop_data: boolean + If True, don't copy data to new dataset + drop_target: boolean + If True, don't copy target to new dataset + overwrite_catalog: boolean + if True, existing entries in datasets and transformers catalogs will be overwritten + """ + dag = DatasetGraph(catalog_path=paths['catalog_path']) + serialized_function = serialize_partial(data_function) + transformers = [partial(copy_dataset, source_dataset_name=source_dataset_name, dataset_name=dataset_name, + added_readme_txt=added_readme_txt, drop_fileset=drop_fileset, drop_data=drop_data, drop_target=drop_target)] + dag.add_edge(input_dataset=source_dataset_name, + output_dataset=dataset_name, + transformer_pipeline=serialize_transformer_pipeline(transformers), + overwrite_catalog=overwrite_catalog) + ds = Dataset.from_catalog(dataset_name) + logger.debug(f"{dataset_name} added to catalog") + return ds + +def metadata_from_fsspec(fs, path, metadata=None, fileset=None): + """Create metadata, FILESET file list from fsspec URL. 
+ + Creates a metadata dict representing a dataset + + + filenames in all uppercase are assumed to be metadata fields + + remaining files are used to populate FILESET data and have their hashes computed. + + Parameters + ---------- + fs: + fsspec.filesystem instance (already connected) + path: + relative to fs + metadata: + current contents of metadata dict. + Metadata obtained from fsurl will overwrite any similarly named fields in this dict + fileset: + Current contents of FILESET. new data will be appended. + Similarly named entries will be overwritten. + + returns metadata dict + """ + # There's a chance this should get rewritten to use 'fsspec.walk' + + if metadata is None: + metadata = {} + if fileset is None: + fileset = metadata.get('fileset', {}) + protocol = fs.protocol + dirs_done = [] + dirs = [path] + + while dirs: + dirname = dirs.pop() + rel_dirname = os.path.relpath(dirname, start=path) + dirs_done.append(dirname) + for file_info in fs.ls(dirname, detail=True): + file_type = file_info.get('type', None) + file_name = file_info['name'] + if file_type == 'directory': + dirs.append(file_name) + elif file_type == 'file': + basename = os.path.basename(os.path.normpath(file_name)) + if str.isupper(basename): + # Add to metadata + with fs.open(file_name, 'r') as fr: + contents = '\n'.join(fr.readlines()) + metadata[str.lower(basename)] = contents + else: + # add file and hash to FILESET + if protocol == "abfs": + # Cheap way to get md5 + md5_arr = file_info['content_settings']['content_md5'] + hashval = f"md5:{''.join('{:02x}'.format(x) for x in md5_arr)}" + else: + logger.warning(f"Unsupported fsspec filesystem: {fs.protocol}. Using size as hash") + hashval = f"size:{fs.size(file_name)}" + rel_path = os.path.relpath(file_info['name'], start=dirname) or "." + # fileset[rel_dirname][rel_path] = [hashval] + entry = {rel_path:[hashval]} + fileset.setdefault(rel_dirname,{}).update(entry) + else: + raise Exception(f"Unknown file type: {file_type}") + metadata["fileset"] = fileset + return metadata + + + +def dataset_from_fsurl(fsurl, + dataset_name=None, + fsspec_auth=None, + metadata=None, + fileset=None, + overwrite_catalog=True): + """Create a dataset from the contents of an fsspec URL + + 'fsurl' is assumed to be a directory/container/bucket. + + Files in this bucket with names entirely in UPPERCASE are assumed + to be textfiles and are used to populate metadata fields directly + as metadata fields (e.g. README, LICENSE) + + Other files have their hashes added to FILESET, and are included in + the FileSet (FILESET data) associated with the dataset. + + Parameters:: + + fsurl: fsspec URL + Should be a "directory", container, or "subdirectory" of said container. + dataset_name: string or None + Name to use for Dataset. + if None, name is the last component of the fsurl path + metadata: + current contents of metadata dict. + Metadata obtained from fsurl will overwrite any similarly named fields in this dict + fileset: + Current contents of FILESET. new data will be appended. + Similarly named entries will be overwritten. + overwrite_catalog: Boolean + if True, entry in Dataset catalog will be overwritten with the newly generated Dataset + + Returns:: + Dataset containing only metadata and FILESET info for all files in the specified fsspec URL. 
+ + """ + if fsspec_auth is None: + fsspec_auth = {} + + f = fsspec.open(fsurl, **fsspec_auth) + path = f.path + if dataset_name is None: + dataset_name = os.path.basename(os.path.normpath(path)) + logger.debug(f"Inferring dataset_name from fsurl: {dataset_name}") + fs = f.fs + protocol = fs.protocol + meta = metadata_from_fsspec(fs, path, metadata=metadata, fileset=fileset) + meta['fileset_base'] = fsurl + ds = dataset_from_metadata(dataset_name, + metadata=meta, + overwrite_catalog=overwrite_catalog) + return ds + +def derived_dataset(*, dataset_name, source_dataset, added_readme_txt=None, drop_fileset=True, data=None, target=None): + """Create a dataset by copying its metadata from another dataset + + Parameters + ---------- + added_readme_txt: string + String to be appended to the end of the new dataset's README metadata + drop_fileset: boolean + if True, ignore fileset when copying metadata + data: + Will be used as contents of new dataset's `data` + target: + Will be used as contents of new dataset's `target` + dataset_name: String + new dataset name + source_dataset: Dataset + Metadata will be copied from this dataset + + Returns + ------- + new (derived) Dataset object + """ + new_metadata = ds.metadata.copy() + if added_readme_txt: + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') + if new_metadata.get('hashes', 0) != 0: + new_metadata.pop('hashes') + ds_out = Dataset(dataset_name, metadata=new_metadata, data=data, target=target, **kwargs) + return ds_out diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py index df901a9..03c6d44 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py @@ -1,6 +1,11 @@ -# Workflow is where we patch around API issues in between releases. -# Nothing in this file is intended to be a stable API. use at your own risk, -# as its contents will be regularly deprecated +"""A module where we temporarily smooth our way around API issues in Easydata. + +This is a place where we temporarily address UX and API issues in Easydata, usually by writing convenient wrappers around existing functionality. + +Nothing in here is intended to be a stable API, so use at your own risk, as these contents are regularly deprecated. + +""" + import sys import logging from .data import Catalog, Dataset, DataSource From 58db18895d720f6ea7fc82d9f114d4556de3a7ee Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 17:33:00 -0800 Subject: [PATCH 04/36] fix typos --- .../reference/easydata/datasets.md | 6 +++--- .../reference/easydata/git-workflow.md | 2 +- .../reference/easydata/notebooks.md | 2 +- .../{{ cookiecutter.module_name }}/_paths.py | 2 +- .../data/transformer_functions.py | 8 -------- .../{{ cookiecutter.module_name }}/helpers.py | 7 +++++-- 6 files changed, 11 insertions(+), 16 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md index 2cd4687..ff5923c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md @@ -49,7 +49,7 @@ The `metadata` is where the magic lives. 
It serves several purposes in terms of * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, * it provides easy access to **what the data is** via the `README` attribute, * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). -* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (extra) files. +* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (fileset) files. In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. @@ -76,7 +76,7 @@ ds.metadata To access the most common metadata fields: ```python -ds.README # or ds.metadata['descr'] +ds.README # or ds.metadata['readme'] ds.LICENSE # or ds.metadata['license'] ds.HASHES # or ds.metadata['hashes'] ``` @@ -87,7 +87,7 @@ To access the catalog: ```python from {{ cookiecutter.module_name }}.data import Catalog -Catalog.load("datasets') +Catalog.load("datasets") ``` ## Sharing your Data as a `Dataset` object diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md index 3eecd80..50d5179 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md @@ -1,5 +1,5 @@ # The EasyData Git Workflow -Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][easydata]. +Here's our suggestion for a reliable git workflow that works well in **small team settings**; e.g. when using [Easydata][easydata] in a group setting. ## Git configuration diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md index 4bae065..270775c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md @@ -96,7 +96,7 @@ There is a whole world of cell magics. 
These are bits of code that you can put a ### Quick References * [README](../README.md) -* [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) +* [Setting up and Maintaining your Conda Environment, Reproducibly](conda-environments.md) * [Getting and Using Datasets](datasets.md) * [Specifying Paths in Easydata](paths.md) * [Using Notebooks for Analysis](notebooks.md) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py index cc3a82c..b938c9f 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py @@ -1,7 +1,7 @@ from .decorators import SingletonDecorator from .kvstore import KVStore from .log import logger -import pathlib import Path +from pathlib import Path class PathStore(KVStore): """Persistent Key-Value store for project-level paths diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py index 525ca5d..7cdf6ad 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py @@ -178,8 +178,6 @@ def csv_to_pandas(ds_dict, *, output_map, **opts): new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=df, metadata=new_metadata) return new_ds - - def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_readme_txt, drop_fileset, **opts): """ Parameters @@ -223,12 +221,6 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata) return new_ds - - new_metadata = ds.metadata.copy() - - new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=preprocessed_corpus, metadata=new_metadata) - return new_ds - def copy_dataset(ds_dict, *, source_dataset_name, dataset_name, added_readme_txt, drop_fileset=True, **opts): """ Create a new dataset by copying an existing one diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py index f103370..186704c 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py @@ -16,10 +16,13 @@ from .data.utils import serialize_partial __all__ = [ - 'notebook_as_transformer', 'dataset_from_csv_manual_download', + 'dataset_from_fsurl', 'dataset_from_metadata', 'dataset_from_single_function', + 'derived_dataset', + 'metadata_from_fsspec', + 'notebook_as_transformer', ] @@ -146,7 +149,7 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, process_function_kwargs = {'do_copy':True, 'file_glob':str(csv_path.name), 'fileset_dir': raw_ds_name+'.fileset', - 'filesetct_dir': raw_ds_name} + 'extract_dir': raw_ds_name} dsrc.process_function = partial(process_function, **process_function_kwargs) datasource_catalog = Catalog.load('datasources') datasource_catalog[dsrc.name] = dsrc.to_dict() From 6c6d7ef7a1ae25249a48ad62da54cf10a73cbc67 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 17:39:03 -0800 Subject: [PATCH 05/36] add missing file --- 
.../data/fileset.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py new file mode 100644 index 0000000..f69aced --- /dev/null +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py @@ -0,0 +1,88 @@ +""" +Functions for handling "fileset" data; i.e. collections of raw files associated with a Dataset +""" + +from collections import defaultdict +import pathlib +import shutil +import os + +from tqdm.auto import tqdm + +from .. import paths +from ..log import logger + +__all__ = [ + 'process_fileset_files', +] + +def process_fileset_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", fileset_dir=".fileset", dataset_dir=None, do_copy=False): + """ + Process unpacked raw files into its minimal dataset components (data, target, metadata). + Here, 'minimal' means `data` and `target` will be None, and `fileset` will contain a + file dict of files matching the specified file_glob (and their sizes). + + Parameters + ---------- + unpack_dir: default paths['interim_data_path'] + The directory the interim data files have been unpacked into + dataset_dir: default paths['processed_data_path'] + location of processed datasets. + extract_dir: + Name of the directory of the unpacked zip file containing the raw data files. + relative to unpack_dir + file_glob: string + Add only files matching this glob pattern to FILESET + fileset_dir: string + Used in building the file_dict keys. + do_copy: boolean + if True, actually copy the files. Otherwise just build FILESET + + Returns + ------- + (data, target, additional_metadata) + + where + + data and target are None, + + metadata contains a file dict; i.e. 
+ 'fileset': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} + """ + if metadata is None: + metadata = {} + + if dataset_dir is None: + dataset_dir = paths['processed_data_path'] + else: + dataset_dir = pathlib.Path(dataset_dir) + if unpack_dir is None: + unpack_dir = paths['interim_data_path'] + else: + unpack_dir = pathlib.Path(unpack_dir) + if extract_dir is not None: + unpack_dir /= extract_dir + + fileset_dir = pathlib.Path(fileset_dir) + fileset_dir_fq = dataset_dir / fileset_dir + logger.debug(f"Do copy: {do_copy}") + if do_copy: + if fileset_dir_fq.is_dir(): + logger.warning(f"Cleaning contents of {fileset_dir}") + shutil.rmtree(fileset_dir_fq) + logger.debug(f"Copying files to {fileset_dir_fq}...") + + file_dict = defaultdict(dict) + files = sorted(list(unpack_dir.rglob(file_glob))) + for i, file in enumerate(tqdm(files)): + if file.is_dir(): + continue + relative_path = file.relative_to(unpack_dir) + fileset_path = fileset_dir / relative_path + file_dict[str(fileset_path.parent)][str(fileset_path.name)] = [f'size:{os.path.getsize(file)}'] + if do_copy: + os.makedirs(dataset_dir / fileset_path.parent, exist_ok=True) + shutil.copyfile(file, dataset_dir / fileset_path) + metadata['fileset'] = dict(file_dict) + + return None, None, metadata From 210e7c5b50d00e5c6dad0f7184a15847fa89294e Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:00:02 -0800 Subject: [PATCH 06/36] fix test dataset generation --- .../data/process_functions.py | 38 +++++++++++ .../tests/make_test_datasets.py | 67 +++++-------------- 2 files changed, 54 insertions(+), 51 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py index 6054735..31cbb1e 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py @@ -3,6 +3,7 @@ """ import pathlib +from sklearn.datasets import fetch_20newsgroups from tqdm.auto import tqdm @@ -10,4 +11,41 @@ from ..log import logger __all__ = [ + 'process_20_newsgroups' ] + +def process_20_newsgroups(*, extract_dir='20_newsgroups', + metadata=None, unpack_dir=None, + opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): + """ + Process 20 newsgroups into (data, target, metadata) format. + + + Parameters + ---------- + unpack_dir: path + The interim parent directory the dataset files have been unpacked into. + extract_dir: str + Name of the directory of the unpacked files relative to the unpack_dir. Note that + opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} + Options to pass to sklearn.datasets.fetch_20newsgroups. 
+ + + Returns + ------- + A tuple: + (data, target, additional_metadata) + + """ + if metadata is None: + metadata = {} + + if unpack_dir is None: + unpack_dir = paths['interim_data_path'] + else: + unpack_dir = pathlib.Path(unpack_dir) + data_dir = unpack_dir / f"{extract_dir}" + + news = fetch_20newsgroups(**opts) + + return news.data, news.target, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 056f497..6e8d66e 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -1,16 +1,13 @@ from sklearn.datasets import fetch_20newsgroups from functools import partial -from {{ cookiecutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog -from {{ cookiecutter.module_name }} import workflow, paths -from {{ cookiecutter.module_name }}.log import logger +from src.data import DataSource, Dataset, DatasetGraph, Catalog +from src.data.process_functions import process_20_newsgroups +from src import paths +from src.log import logger # Set up a 20 newsgroups dataset -ds_name = '20_newsgroups' -output_ds_name = ds_name -dsrc = DataSource(ds_name) - license = """ Custom Academic License: "You may use this material free of charge for any educational purpose, provided attribution is given in any lectures or publications that make use of this material." As in http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.data.html. """ @@ -46,51 +43,19 @@ By default we follow the sklearn suggestion to set `remove=('headers', 'footers', 'quotes')` to avoid overfitting. """ +if __name__ =='__main__': + ds_name = '20_newsgroups' + output_ds_name = ds_name + dsrc = DataSource(ds_name) -dsrc.add_metadata(contents=metadata, force=True) -dsrc.add_metadata(contents=license, kind='LICENSE', force=True) - -def process_20_newsgroups(*, extract_dir='20_newsgroups', - metadata=None, unpack_dir=None, - opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): - """ - Process 20 newsgroups into (data, target, metadata) format. - - - Parameters - ---------- - unpack_dir: path - The interim parent directory the dataset files have been unpacked into. - extract_dir: str - Name of the directory of the unpacked files relative to the unpack_dir. Note that - opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} - Options to pass to sklearn.datasets.fetch_20newsgroups. 
- - - Returns - ------- - A tuple: - (data, target, additional_metadata) - - """ - if metadata is None: - metadata = {} - - if unpack_dir is None: - unpack_dir = paths['interim_data_path'] - else: - unpack_dir = pathlib.Path(unpack_dir) - data_dir = unpack_dir / f"{extract_dir}" - - news = fetch_20newsgroups(**opts) - - return news.data, news.target, metadata + dsrc.add_metadata(contents=metadata, force=True) + dsrc.add_metadata(contents=license, kind='LICENSE', force=True) -process_function = process_20_newsgroups -process_kwargs = {} + process_function = process_20_newsgroups + process_kwargs = {} -dsrc.process_function = partial(process_function, **process_kwargs) -dsrc.update_catalog() + dsrc.process_function = partial(process_function, **process_kwargs) + dsrc.update_catalog() -dag = DatasetGraph() -dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) + dag = DatasetGraph() + dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) From 944c39477209f8d365133d1e15f60c567dd83397 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:02:25 -0800 Subject: [PATCH 07/36] remove use of src --- .../tests/make_test_datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 6e8d66e..90db289 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -1,10 +1,10 @@ from sklearn.datasets import fetch_20newsgroups from functools import partial -from src.data import DataSource, Dataset, DatasetGraph, Catalog -from src.data.process_functions import process_20_newsgroups -from src import paths -from src.log import logger +from {{ cookicutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog +from {{ cookicutter.module_name }}.data.process_functions import process_20_newsgroups +from {{ cookicutter.module_name }} import paths +from {{ cookicutter.module_name }}.log import logger # Set up a 20 newsgroups dataset From 4f2d1189b8fb5b2989f7834373b93aea3bf22258 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:03:43 -0800 Subject: [PATCH 08/36] fix typo --- .../tests/make_test_datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 90db289..31f55a8 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -1,10 +1,10 @@ from sklearn.datasets import fetch_20newsgroups from functools import partial -from {{ cookicutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog -from {{ cookicutter.module_name }}.data.process_functions import process_20_newsgroups -from {{ cookicutter.module_name }} import paths -from {{ cookicutter.module_name }}.log import logger +from {{ cookiecutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog +from {{ cookiecutter.module_name }}.data.process_functions import process_20_newsgroups +from {{ cookiecutter.module_name }} 
import paths +from {{ cookiecutter.module_name }}.log import logger # Set up a 20 newsgroups dataset From 6f60931f215e78c0f5f54a9feed89d7402e0670a Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:47:38 -0800 Subject: [PATCH 09/36] try using a miniconda image --- .../.circleci/config.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/.circleci/config.yml b/{{ cookiecutter.repo_name }}/.circleci/config.yml index 86db8c0..10f4984 100644 --- a/{{ cookiecutter.repo_name }}/.circleci/config.yml +++ b/{{ cookiecutter.repo_name }}/.circleci/config.yml @@ -8,7 +8,8 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` - - image: circleci/python:3.7.0 + - image: continuumio/miniconda3 + # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -20,13 +21,13 @@ jobs: steps: - checkout - - run: - name: Set up Anaconda - command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; + #- run: + #name: Set up Anaconda + #command: | + # wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; + # chmod +x ~/miniconda.sh; + # ~/miniconda.sh -b -p ~/miniconda; + # echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - run: name: Create environment and contrive to always use it From d3cbe679218991b85466f0d86ad3af23080443fd Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:49:58 -0800 Subject: [PATCH 10/36] remove comments --- {{ cookiecutter.repo_name }}/.circleci/config.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/.circleci/config.yml b/{{ cookiecutter.repo_name }}/.circleci/config.yml index 10f4984..98373ef 100644 --- a/{{ cookiecutter.repo_name }}/.circleci/config.yml +++ b/{{ cookiecutter.repo_name }}/.circleci/config.yml @@ -21,14 +21,6 @@ jobs: steps: - checkout - #- run: - #name: Set up Anaconda - #command: | - # wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - # chmod +x ~/miniconda.sh; - # ~/miniconda.sh -b -p ~/miniconda; - # echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - - run: name: Create environment and contrive to always use it command: | From 49153b1c21aab5bdd390b0abc3eb02e40f7f8f11 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:52:10 -0800 Subject: [PATCH 11/36] Updated config.yml --- .circleci/config.yml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0e16d21..861f07f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. 
`3.6.1-browsers` - - image: cimg/python:3.8.0 + - image: continuumio/miniconda3 # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -19,19 +19,7 @@ jobs: steps: - checkout - - - run: - name: Set up Anaconda - command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - export PATH=~/miniconda/bin:$PATH - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - conda update --yes --quiet conda; - conda init bash - sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV - + - run: name: Build cookiecutter environment and test-env project command: | From 0649fc3bdb0f1d101dba5cb7d753bea082a23d7e Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:58:26 -0800 Subject: [PATCH 12/36] Updated config.yml --- .circleci/config.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 861f07f..a0fd328 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -20,6 +20,14 @@ jobs: steps: - checkout + - run: + name: Set up Conda + command: | + conda init bash + conda update --yes --quiet conda; + export CONDA_EXE=/home/circleci/miniconda/bin/conda + sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV + - run: name: Build cookiecutter environment and test-env project command: | From 41907d3650d277cd790f5d4c46ce9c070f9f0a96 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:46:48 -0800 Subject: [PATCH 13/36] Updated config.yml --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a0fd328..4bd1897 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -35,7 +35,7 @@ jobs: conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml - mkdir /home/circleci/.cookiecutter_replay + mkdir -p /home/circleci/.cookiecutter_replay cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json pwd cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input From 39c611116fb392767224b2c8ac567fd34ceb30b9 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:52:21 -0800 Subject: [PATCH 14/36] Updated config.yml --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4bd1897..95898dd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,7 +31,7 @@ jobs: - run: name: Build cookiecutter environment and test-env project command: | - conda create -n cookiecutter --yes python=3.8 + conda create -n cookiecutter --yes python=3.8 make conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml @@ -48,6 +48,7 @@ jobs: export CONDA_EXE=/home/circleci/miniconda/bin/conda make create_environment conda activate test-env + conda install -c anaconda make touch environment.yml make update_environment echo "conda activate test-env" >> $BASH_ENV; From 1ba37cd34e4cb1e01e15a22bc7ad28eb4319c574 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:54:29 -0800 Subject: [PATCH 15/36] Updated config.yml --- .circleci/config.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 95898dd..35b2951 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,8 +39,7 @@ jobs: cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json pwd cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . -f --no-input - conda deactivate - + - run: name: Create test-env environment and contrive to always use it command: | From 951b80673b1847b97ebefb4c3ca26464fd5694c2 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:56:11 -0800 Subject: [PATCH 16/36] Updated config.yml --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 35b2951..487814b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,11 +38,13 @@ jobs: mkdir -p /home/circleci/.cookiecutter_replay cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json pwd + which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input - run: name: Create test-env environment and contrive to always use it command: | + conda activate cookiecutter cd test-env export CONDA_EXE=/home/circleci/miniconda/bin/conda make create_environment From dbe1a1af1e940a6004c0c4487a585a78a4db81bb Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:59:42 -0800 Subject: [PATCH 17/36] Updated config.yml --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 487814b..a589dd5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,6 +25,7 @@ jobs: command: | conda init bash conda update --yes --quiet conda; + which conda export CONDA_EXE=/home/circleci/miniconda/bin/conda sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV From 9f2d43addd08a7a5d5548850a35a1661fbbb6a82 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 20:02:12 -0800 Subject: [PATCH 18/36] Updated config.yml --- .circleci/config.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a589dd5..6c79e63 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,8 +25,7 @@ jobs: command: | conda init bash conda update --yes --quiet conda; - which conda - export CONDA_EXE=/home/circleci/miniconda/bin/conda + export CONDA_EXE=/opt/conda/bin/conda sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV - run: @@ -36,8 +35,8 @@ jobs: conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml - mkdir -p /home/circleci/.cookiecutter_replay - cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json + mkdir -p /root/repo/.cookiecutter_replay + cp circleci-cookiecutter-easydata.json /root/repo/.cookiecutter_replay/cookiecutter-easydata.json pwd which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input @@ -47,7 +46,7 @@ jobs: command: | conda activate cookiecutter cd test-env - export CONDA_EXE=/home/circleci/miniconda/bin/conda + export CONDA_EXE=/opt/conda/bin/conda make create_environment conda activate test-env conda install -c anaconda make From ea355648556d12c1518cf80f03622726a2b11e18 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Sat, 31 Dec 2022 10:28:56 -0800 Subject: [PATCH 19/36] update extra -> fileset and descr -> readme --- docs/00-xyz-sample-notebook.ipynb | 2 +- docs/Add-csv-template.ipynb | 14 +++++++------- docs/Add-derived-dataset.ipynb | 10 +++++----- docs/New-Dataset-Template.ipynb | 12 ++++++------ docs/New-Edge-Template.ipynb | 4 ++-- docs/test_docs.py | 3 +++ 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/docs/00-xyz-sample-notebook.ipynb b/docs/00-xyz-sample-notebook.ipynb index a089002..cc90381 100644 --- a/docs/00-xyz-sample-notebook.ipynb +++ b/docs/00-xyz-sample-notebook.ipynb @@ -150,7 +150,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/Add-csv-template.ipynb b/docs/Add-csv-template.ipynb index ad69434..ad1e37d 100644 --- a/docs/Add-csv-template.ipynb +++ b/docs/Add-csv-template.ipynb @@ -83,7 +83,7 @@ "* `csv_path`: The desired path to your .csv file (in this case `epidemiology.csv`) relative to paths['raw_data_path']\n", "* `download_message`: The message to display to indicate to the user how to manually download your .csv file.\n", "* `license_str`: Information on the license for the dataset\n", - "* `descr_str`: Information on the dataset itself" + "* `readme_str`: Information on the dataset itself" ] }, { @@ -123,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "descr_str = \"\"\"\n", + "readme_str = \"\"\"\n", "The epidemiology table from Google's [COVID-19 Open-Data dataset](https://github.com/GoogleCloudPlatform/covid-19-open-data). \n", "\n", "The full dataset contains datasets of daily time-series data related to COVID-19 for over 20,000 distinct locations around the world. The data is at the spatial resolution of states/provinces for most regions and at county/municipality resolution for many countries such as Argentina, Brazil, Chile, Colombia, Czech Republic, Mexico, Netherlands, Peru, United Kingdom, and USA. All regions are assigned a unique location key, which resolves discrepancies between ISO / NUTS / FIPS codes, etc. The different aggregation levels are:\n", @@ -170,7 +170,7 @@ " csv_path=csv_path,\n", " download_message=download_message,\n", " license_str=license_str,\n", - " descr_str=descr_str,\n", + " readme_str=readme_str,\n", " overwrite_catalog=True)" ] }, @@ -206,9 +206,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.EXTRA`.\n", + "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.FILESET`.\n", "\n", - "The `.EXTRA` functionality is covered in other documentation." + "The `.FILESET` functionality is covered in other documentation." 
] }, { @@ -236,7 +236,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds_raw.EXTRA" + "ds_raw.FILESET" ] }, { @@ -246,7 +246,7 @@ "outputs": [], "source": [ "# fq path to epidemiology.csv file\n", - "ds_raw.extra_file('epidemiology.csv')" + "ds_raw.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/Add-derived-dataset.ipynb b/docs/Add-derived-dataset.ipynb index e639190..d5e93e4 100644 --- a/docs/Add-derived-dataset.ipynb +++ b/docs/Add-derived-dataset.ipynb @@ -85,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { @@ -219,7 +219,7 @@ " source_dataset_name\n", " dataset_name\n", " data_function\n", - " added_descr_txt\n", + " added_readme_txt\n", "\n", "We'll want our `data_function` to be defined in the project module (in this case `src`) for reproducibility reasons (which we've already done with `subselect_by_key` above)." ] @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "added_descr_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", + "added_readme_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", "to the {key} dataset.\"\"\"" ] }, @@ -281,7 +281,7 @@ " source_dataset_name=source_dataset_name,\n", " dataset_name=dataset_name,\n", " data_function=data_function,\n", - " added_descr_txt=added_descr_txt,\n", + " added_readme_txt=added_readme_txt,\n", " overwrite_catalog=True)" ] }, @@ -318,7 +318,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/New-Dataset-Template.ipynb b/docs/New-Dataset-Template.ipynb index bcf7826..abb8e88 100644 --- a/docs/New-Dataset-Template.ipynb +++ b/docs/New-Dataset-Template.ipynb @@ -167,7 +167,7 @@ "metadata": {}, "source": [ "### Create a process function\n", - "By default, we recommend that you use the `process_extra_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." + "By default, we recommend that you use the `process_fileset_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." 
] }, { @@ -176,11 +176,11 @@ "metadata": {}, "outputs": [], "source": [ - "from src.data.extra import process_extra_files\n", - "process_function = process_extra_files\n", + "from src.data.fileset import process_fileset_files\n", + "process_function = process_fileset_files\n", "process_function_kwargs = {'file_glob':'*.csv',\n", " 'do_copy': True,\n", - " 'extra_dir': ds_name+'.extra',\n", + " 'fileset_dir': ds_name+'.fileset',\n", " 'extract_dir': ds_name}" ] }, @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.EXTRA" + "ds.FILESET" ] }, { @@ -364,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.extra_file('epidemiology.csv')" + "ds.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/New-Edge-Template.ipynb b/docs/New-Edge-Template.ipynb index 6a1c5bb..3b1058e 100644 --- a/docs/New-Edge-Template.ipynb +++ b/docs/New-Edge-Template.ipynb @@ -88,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "source_ds.EXTRA" + "source_ds.FILESET" ] }, { @@ -178,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/test_docs.py b/docs/test_docs.py index 045cc56..2eb7922 100644 --- a/docs/test_docs.py +++ b/docs/test_docs.py @@ -9,6 +9,8 @@ import requests from src import paths +from src.log import logger + CCDS_ROOT = Path(__file__).parents[1].resolve() DOCS_DIR = CCDS_ROOT / "docs" @@ -35,6 +37,7 @@ def test_notebook_csv(self): csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv" csv_dest = paths['raw_data_path'] / "epidemiology.csv" if not csv_dest.exists(): + logger.DEBUG("Downloading epidemiology.csv") csv_file = requests.get(csv_url) with open(csv_dest, 'wb') as f: f.write(csv_file.content) From 0be828200d8823a1cbbe2c1254826435a30f550d Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Sat, 31 Dec 2022 11:34:02 -0800 Subject: [PATCH 20/36] change to lowercase --- docs/test_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/test_docs.py b/docs/test_docs.py index 2eb7922..7e8d17a 100644 --- a/docs/test_docs.py +++ b/docs/test_docs.py @@ -37,7 +37,7 @@ def test_notebook_csv(self): csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv" csv_dest = paths['raw_data_path'] / "epidemiology.csv" if not csv_dest.exists(): - logger.DEBUG("Downloading epidemiology.csv") + logger.debug("Downloading epidemiology.csv") csv_file = requests.get(csv_url) with open(csv_dest, 'wb') as f: f.write(csv_file.content) From 8920b93fc4a1edb19c200b99c274668418646dd7 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:37:53 -0500 Subject: [PATCH 21/36] handle arbitraty conda channels --- {{ cookiecutter.repo_name }}/Makefile.envs | 25 +++--- .../scripts/split_pip.py | 86 ++++++++++++------- 2 files changed, 63 insertions(+), 48 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 4c65eb7..5723f76 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -4,28 +4,22 @@ include Makefile.include -$(LOCKFILE): check_installation .make.bootstrap .make.pip-requirements.txt .make.environment-default.yml .make.conda-forge-requirements.txt +$(LOCKFILE): check_installation .make.bootstrap split_environment_files ifeq (conda, $(VIRTUALENV)) - $(CONDA_EXE) env update -n $(PROJECT_NAME) -f .make.environment-default.yml --prune - $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.conda-forge-requirements.txt --channel 
defaults --channel conda-forge --strict-channel-priority --yes + for channel in $(shell $(CAT) .make.channel-order.include); do\ + $(ECHO) installing from .make.$$channel-environment.txt;\ + $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$$channel-environment.txt --channel defaults --channel $$channel --strict-channel-priority --yes ;\ + done $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture pip install -r .make.pip-requirements.txt $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $(LOCKFILE) else $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) endif -# extract multi-phase dependencies from environment.yml -.make.environment-pip.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip-yaml $(PROJECT_DIR)environment.yml > $@ - -.make.pip-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip $(PROJECT_DIR)environment.yml > $@ - -.make.conda-forge-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py conda-forge $(PROJECT_DIR)environment.yml > $@ - -.make.environment-default.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py default $(PROJECT_DIR)environment.yml > $@ +.PHONY: split_environment_files +## extract multi-phase dependencies from environment.yml and create ordering file +split_environment_files: environment.yml .make.bootstrap + $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py $(PROJECT_DIR)environment.yml .make.bootstrap: scripts/bootstrap.yml $(CONDA_EXE) env update -n $(PROJECT_NAME) -f scripts/bootstrap.yml @@ -69,6 +63,7 @@ endif # Checks that the conda environment is active environment_enabled: ifeq (conda,$(VIRTUALENV)) + $(CONDA_EXE) config --env --set channel_priority strict ifneq ($(notdir ${CONDA_DEFAULT_ENV}), $(PROJECT_NAME)) $(error Run "$(VIRTUALENV) activate $(PROJECT_NAME)" before proceeding...) endif diff --git a/{{ cookiecutter.repo_name }}/scripts/split_pip.py b/{{ cookiecutter.repo_name }}/scripts/split_pip.py index ecdc987..53e04fc 100644 --- a/{{ cookiecutter.repo_name }}/scripts/split_pip.py +++ b/{{ cookiecutter.repo_name }}/scripts/split_pip.py @@ -2,13 +2,19 @@ import json import sys import yaml +from collections import defaultdict -ACCEPTABLE_FORMATS = ["default", "pip", "pip-yaml", "conda-forge"] -def env_split(conda_env, kind="default"): - """Given a conda_environment dict, split into pip/nonpip versions +def env_split(conda_env, channel_order): + """Given a conda_environment dict, and a channel order, split into versions for each channel. 
+ + Returns: + + conda_env: (list) + remaining setup bits of the environment.yml file + channel_dict: (dict) + dict containing the list of dependencies by channel name - conda_env: dict Python object corresponding to environment.yml""" # Cheater way to make deep Copies json_copy = json.dumps(conda_env) @@ -17,49 +23,63 @@ def env_split(conda_env, kind="default"): pipdeps = None deplist = conda_env.pop('dependencies') - conda_forge_list = [] + channel_dict = defaultdict(list) for k, dep in enumerate(deplist[:]): # Note: copy list, as we mutate it if isinstance(dep, dict): # nested yaml if dep.get('pip', None): - pipdeps = ["pip", deplist.pop(k)] + channel_dict['pip'] = deplist.pop(k) else: - prefix = 'conda-forge::' - if dep.startswith(prefix): - conda_forge_list.append(dep[len(prefix):]) + prefix_check = dep.split('::') + if len(prefix_check) > 1: + channel = prefix_check[0] + if not channel in channel_order: + raise Exception(f'the channel {channel} required for {dep} is not specified in a channel-order section of the environment file') + channel_dict[f'{channel}'].append(prefix_check[1]) deplist.remove(dep) - conda_env['dependencies'] = deplist - pip_env['dependencies'] = pipdeps - return conda_env, pip_env, conda_forge_list + channel_dict['defaults'] = deplist + conda_env.pop('channel-order') + return conda_env, channel_dict + +def get_channel_order(conda_env): + """ + Given a conda_environment dict, get the channels from the channel order. + """ + channel_order = conda_env.get('channel-order') + + if channel_order is None: + channel_order = ['defaults'] + if not 'defaults' in channel_order: + channel_order.insert(0, 'defaults') + channel_order.append('pip') + return channel_order def usage(): print(f""" -Usage: split_pip.py [{"|".join(ACCEPTABLE_FORMATS)}] path/to/environment.yml +Usage: split_pip.py path/to/environment.yml """) if __name__ == '__main__': - if len(sys.argv) != 3: - usage() - exit(1) - - kind = sys.argv[1] - if kind not in ACCEPTABLE_FORMATS: + if len(sys.argv) != 2: usage() exit(1) - with open(sys.argv[2], 'r') as yamlfile: + with open(sys.argv[1], 'r') as yamlfile: conda_env = yaml.safe_load(yamlfile) - cenv, penv, forgelist = env_split(conda_env) - if kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "pip": - print("\n".join(penv["dependencies"].pop(-1)["pip"])) - elif kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "default": - _ = yaml.dump(cenv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "conda-forge": - print("\n".join(forgelist)) - else: - raise Exception(f"Invalid Kind: {kind}") + #check for acceptable formats + channel_order = get_channel_order(conda_env) + with open('.make.channel-order.include', 'w') as f: + f. 
write(' '.join(channel_order[:-1])) #exclude pip as a channel here + + cenv, channel_dict = env_split(conda_env, channel_order) + + for kind in channel_order: + if kind == "pip": + filename = '.make.pip-requirements.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict['pip']['pip'])) + else: + filename = f'.make.{kind}-environment.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict[kind])) From baa9fa2fa54e8ba5a0a03ea2fdd82019fc3ec305 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:38:14 -0500 Subject: [PATCH 22/36] use the template python version --- {{ cookiecutter.repo_name }}/scripts/bootstrap.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index c52f026..4997352 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -1,5 +1,13 @@ +{% macro pyver() -%} +{% if cookiecutter.python_version == 'latest' -%} + - python=3 +{% else -%} + - python={{ cookiecutter.python_version }} +{% endif -%} +{% endmacro -%} +name: {{ cookiecutter.repo_name }} channels: - defaults dependencies: - - python=3.7 - pyyaml +{{ pyver()|indent(2, true) }} From 09c1fd4dfa3507535822baa1bce9434fb70e446b Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:51:59 -0500 Subject: [PATCH 23/36] add test of the new environment code --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6c79e63..3bf0b03 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -40,12 +40,13 @@ jobs: pwd which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input - + - run: name: Create test-env environment and contrive to always use it command: | conda activate cookiecutter cd test-env + python scripts/tests/add-extra-channel-dependency.py export CONDA_EXE=/opt/conda/bin/conda make create_environment conda activate test-env From ceb636b39eb37eb719f0915fd8dc633b270b38b4 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:56:31 -0500 Subject: [PATCH 24/36] add missing file --- .../scripts/tests/add-extra-channel-dependency.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 {{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py diff --git a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py new file mode 100644 index 0000000..c615a4e --- /dev/null +++ b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py @@ -0,0 +1,14 @@ +import sys +import yaml + + +if __name__ == "__main__": + channel_order = ['defaults', 'pytorch'] + dependency_new = "pytorch::cpuonly" + + with open("environment.yml", "rt", encoding="utf-8") as file_env: + env = yaml.safe_load(file_env) + env["dependencies"].append(dependency_new) + env["channel_order"] = channel_order + with open("environment.yml", "wt", encoding="utf-8") as file_env: + yaml.safe_dump(env, file_env) From 9739191f857dff2bc8d420b98dbe449e556df651 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:08:37 -0500 Subject: [PATCH 25/36] fix indentation --- .../scripts/bootstrap.yml | 6 ++-- .../test-environment.yml | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 {{ cookiecutter.repo_name }}/test-environment.yml diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index 4997352..20cd12d 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -7,7 +7,7 @@ {% endmacro -%} name: {{ cookiecutter.repo_name }} channels: - - defaults + - defaults dependencies: - - pyyaml -{{ pyver()|indent(2, true) }} + - pyyaml +{{ pyver()|indent(3, true) }} diff --git a/{{ cookiecutter.repo_name }}/test-environment.yml b/{{ cookiecutter.repo_name }}/test-environment.yml new file mode 100644 index 0000000..9845bc6 --- /dev/null +++ b/{{ cookiecutter.repo_name }}/test-environment.yml @@ -0,0 +1,33 @@ +channel_order: +- defaults +- pytorch +channels: +- defaults +dependencies: +- pip +- pip: + - -e . 
+ - python-dotenv>=0.5.1 + - nbval + - nbdime + - gdown +- setuptools +- wheel +- git>=2.5 +- sphinx +- bokeh +- click +- colorcet +- coverage +- coveralls +- matplotlib +- jupyter +- scikit-learn +- scipy +- joblib +- nb_conda_kernels +- pandas +- requests +- pathlib +- fsspec +- pytorch::cpuonly From 25e41e37a0d59fa7241e8f35807b9aefe8a0dd21 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:12:04 -0500 Subject: [PATCH 26/36] fix typo and remove test file --- .../scripts/tests/add-extra-channel-dependency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py index c615a4e..8c41a6b 100644 --- a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py +++ b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py @@ -9,6 +9,6 @@ with open("environment.yml", "rt", encoding="utf-8") as file_env: env = yaml.safe_load(file_env) env["dependencies"].append(dependency_new) - env["channel_order"] = channel_order + env["channel-order"] = channel_order with open("environment.yml", "wt", encoding="utf-8") as file_env: yaml.safe_dump(env, file_env) From f0f548134adedc03b6fdd38ffabe7478b177e25c Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:25:17 -0500 Subject: [PATCH 27/36] for latest, defautl to conda latest python version and remove test file --- {{ cookiecutter.repo_name }}/environment.yml | 2 +- .../scripts/bootstrap.yml | 2 +- .../test-environment.yml | 33 ------------------- 3 files changed, 2 insertions(+), 35 deletions(-) delete mode 100644 {{ cookiecutter.repo_name }}/test-environment.yml diff --git a/{{ cookiecutter.repo_name }}/environment.yml b/{{ cookiecutter.repo_name }}/environment.yml index 6749871..5982a14 100644 --- a/{{ cookiecutter.repo_name }}/environment.yml +++ b/{{ cookiecutter.repo_name }}/environment.yml @@ -1,6 +1,6 @@ {% macro pyver() -%} {% if cookiecutter.python_version == 'latest' -%} - - python=3 + - python {% else -%} - python={{ cookiecutter.python_version }} {% endif -%} diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index 20cd12d..d0e5cc0 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -1,6 +1,6 @@ {% macro pyver() -%} {% if cookiecutter.python_version == 'latest' -%} - - python=3 + - python {% else -%} - python={{ cookiecutter.python_version }} {% endif -%} diff --git a/{{ cookiecutter.repo_name }}/test-environment.yml b/{{ cookiecutter.repo_name }}/test-environment.yml deleted file mode 100644 index 9845bc6..0000000 --- a/{{ cookiecutter.repo_name }}/test-environment.yml +++ /dev/null @@ -1,33 +0,0 @@ -channel_order: -- defaults -- pytorch -channels: -- defaults -dependencies: -- pip -- pip: - - -e . 
- - python-dotenv>=0.5.1 - - nbval - - nbdime - - gdown -- setuptools -- wheel -- git>=2.5 -- sphinx -- bokeh -- click -- colorcet -- coverage -- coveralls -- matplotlib -- jupyter -- scikit-learn -- scipy -- joblib -- nb_conda_kernels -- pandas -- requests -- pathlib -- fsspec -- pytorch::cpuonly From b22f548c11593b291a2920933d5c62e2e3385a34 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:41:25 -0500 Subject: [PATCH 28/36] handle situation where channel-order doesn't exist --- .circleci/config.yml | 2 +- {{ cookiecutter.repo_name }}/scripts/split_pip.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3bf0b03..788c38a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,9 +46,9 @@ jobs: command: | conda activate cookiecutter cd test-env - python scripts/tests/add-extra-channel-dependency.py export CONDA_EXE=/opt/conda/bin/conda make create_environment + python scripts/tests/add-extra-channel-dependency.py conda activate test-env conda install -c anaconda make touch environment.yml diff --git a/{{ cookiecutter.repo_name }}/scripts/split_pip.py b/{{ cookiecutter.repo_name }}/scripts/split_pip.py index 53e04fc..62d059c 100644 --- a/{{ cookiecutter.repo_name }}/scripts/split_pip.py +++ b/{{ cookiecutter.repo_name }}/scripts/split_pip.py @@ -39,7 +39,7 @@ def env_split(conda_env, channel_order): deplist.remove(dep) channel_dict['defaults'] = deplist - conda_env.pop('channel-order') + conda_env.pop('channel-order', None) return conda_env, channel_dict def get_channel_order(conda_env): From 756485e283d90af3cc140913bf0e9a80cf2180c0 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Mon, 30 Jan 2023 21:54:11 -0500 Subject: [PATCH 29/36] use a windows friendly loop --- {{ cookiecutter.repo_name }}/Makefile.envs | 6 ++---- {{ cookiecutter.repo_name }}/Makefile.include | 1 + {{ cookiecutter.repo_name }}/Makefile.win32 | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 5723f76..02feb89 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -6,10 +6,8 @@ include Makefile.include $(LOCKFILE): check_installation .make.bootstrap split_environment_files ifeq (conda, $(VIRTUALENV)) - for channel in $(shell $(CAT) .make.channel-order.include); do\ - $(ECHO) installing from .make.$$channel-environment.txt;\ - $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$$channel-environment.txt --channel defaults --channel $$channel --strict-channel-priority --yes ;\ - done + $(foreach channel, $(shell $(CAT) .make.channel-order.include),\ + $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$(channel)-environment.txt --channel defaults --channel $(channel) --strict-channel-priority --yes $(CMDSEP)) $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture pip install -r .make.pip-requirements.txt $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $(LOCKFILE) else diff --git a/{{ cookiecutter.repo_name }}/Makefile.include b/{{ cookiecutter.repo_name }}/Makefile.include index e8486ca..fc65727 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.include +++ b/{{ cookiecutter.repo_name }}/Makefile.include @@ -19,5 +19,6 @@ CAT ?= cat SET ?= export WHICH ?= which DEVNULL ?= /dev/null +CMDSEP ?= ; $(warning From here on, using SHELL = $(SHELL)) diff --git a/{{ cookiecutter.repo_name }}/Makefile.win32 b/{{ cookiecutter.repo_name }}/Makefile.win32 index 92d8800..de046eb 
100644 --- a/{{ cookiecutter.repo_name }}/Makefile.win32 +++ b/{{ cookiecutter.repo_name }}/Makefile.win32 @@ -5,6 +5,7 @@ CAT = type SET = set WHICH = where DEVNULL = nul +CMDSEP = & # Some UNIXish packages force the installation of a Bourne-compatible shell, and Make # prefers using this when it sees it. We thus force the usage of the good ole Batch From 6e69de1eff50e7931978ac951916e26fdb18d776 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 1 Feb 2023 11:01:34 -0500 Subject: [PATCH 30/36] update environment management instructions to include channel order and arbitrary channel use --- .../reference/easydata/conda-environments.md | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md index 724d131..60a9a9f 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md @@ -81,6 +81,7 @@ When adding packages to your python environment, **do not `pip install` or `cond Your `environment.yml` file will look something like this: ``` name: {{ cookiecutter.repo_name }} +dependencies: - pip - pip: - -e . # conda >= 4.4 only @@ -106,7 +107,7 @@ name: {{ cookiecutter.repo_name }} ``` To add any package available from conda, add it to the end of the list. If you have a PYPI dependency that's not avaible via conda, add it to the list of pip installable dependencies under ` - pip:`. -You can include any {{ cookiecutter.upstream_location }} python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. +You can include any `{{ cookiecutter.upstream_location }}` python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. In particular, if you're working off of a fork or a work in progress branch of a repo in {{ cookiecutter.upstream_location }} (say, your personal version of ), you can change `git+https://{{ cookiecutter.upstream_location }}//` to @@ -117,6 +118,43 @@ Once you're done your edits, run `make update_environment` and voila, you're upd To share your updated environment, check in your `environment.yml` file. (More on this in [Sharing your Work](sharing-your-work.md)) +#### Adding packages from other conda channels +Say we want to add a package only available from the `conda-forge` conda channel and not the default conda channel. (The conda channel is what follows `-c` when using `conda install -c my-channel my-package`. Suppose we want to use `make` on windows. Then we need to use `conda-forge` since the default conda channel only has linux and macOS installations of `make`. To normally conda install this, we would use `conda install -c conda-forge make`. **We won't do that here**. + +Instead, we add a `channel-order` section that starts with `defaults` and lists the other channels we want to use in the order we want to install from them (note that this is a custom EasyData section to the `environment.yml`). Then we add our package in the dependency list in the form `channel-name::package-name`, for example, `conda-forge::make`. + +In this case an updated `environment.yml` file looks like this: +``` +name: {{ cookiecutter.repo_name }} +channel-order: + - defaults + - conda-forge +dependencies: + - pip + - pip: + - -e . 
# conda >= 4.4 only + - python-dotenv>=0.5.1 + - nbval + - nbdime + - umap-learn + - gdown + - setuptools + - wheel + - git>=2.5 # for git worktree template updating + - sphinx + - bokeh + - click + - colorcet + - coverage + - coveralls + - datashader + - holoviews + - matplotlib + - jupyter + - conda-forge::make +... +``` + #### Lock files Now, we'll admit that this workflow isn't perfectly reproducible in the sense that conda still has to resolve versions from the `environment.yml`. To make it more reproducible, running either `make create_environment` or `make update_environment` will generate an `environment.{$ARCH}.lock.yml` (e.g. `environment.i386.lock.yml`). This file keeps a record of the exact environment that is currently installed in your conda environment `{{ cookiecutter.repo_name }}`. If you ever need to reproduce an environment exactly, you can install from the `.lock.yml` file. (Note: These are architecture dependent). From 87159f708fb817b0e2b5be5004d5d6a226efedd2 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Wed, 1 Feb 2023 11:29:02 -0500 Subject: [PATCH 31/36] Update README.md Include references to documentation and where to look for more information --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 2f2a732..7497c45 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,24 @@ python -m pip install -f requirements.txt cookiecutter https://github.com/hackalog/easydata +### To find out more +------------ +A good place to start is with reproducible environments. We have a tutorial here: [Getting Started with EasyData Environments](https://github.com/hackalog/easydata/wiki/Getting-Started-with-EasyData-Environments). + +The next place to look is in the customized documentation that is in any EasyData created repo. It is customized to the settings that you put in your template. 
These reference documents live under `references/easydata` and cover:
+ * more on conda environments
+ * more on paths
+ * git configuration (including setting up ssh with GitHub)
+ * git workflows
+ * tricks for using Jupyter notebooks in an EasyData environment
+ * troubleshooting
+ * recommendations for how to share your work
+
+Furthermore, see:
+* [The EasyData documentation on read the docs](https://cookiecutter-easydata.readthedocs.io/en/latest/?badge=latest): this contains up-to-date working examples of how to use EasyData for reproducible datasets and some ways to use notebooks reproducibly
+* [Talks and Tutorials based on EasyData](https://github.com/hackalog/easydata/wiki/EasyData-Talks-and-Tutorials)
+* [Catalog of EasyData Documentation](https://github.com/hackalog/easydata/wiki/Catalog-of-EasyData-Documentation)
+* [The EasyData wiki](https://github.com/hackalog/easydata/wiki): check here for further troubleshooting and how-to guides for problems that aren't covered in the `references/easydata` docs (including a `git` tutorial).
 ### The resulting directory structure
 ------------
From 4b541c9df22179400154d1690c709fe0fa29c3c6 Mon Sep 17 00:00:00 2001
From: Amy Wooding
Date: Wed, 1 Feb 2023 11:40:18 -0500
Subject: [PATCH 32/36] remove travis.ci testing

---
 .travis.yml | 51 ---------------------------------------------------
 1 file changed, 51 deletions(-)
 delete mode 100644 .travis.yml

diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index b110146..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-language: python
-
-cache:
-  directories:
-  - $HOME/.cache/pip
-
-python:
-  - "3.8"
-
-envs:
-  - REQUIRED_PYTHON="python3"
-
-install:
-  # install miniconda
-  - deactivate
-  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-  - MINICONDA_PATH=/home/travis/miniconda3
-  - chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH
-  - chmod +x $MINICONDA_PATH
-  - export PATH=$MINICONDA_PATH/condabin:$PATH
-  - conda update --yes conda
-  # create cookiecutter environment
-  - conda create -n cookiecutter --yes python=3.8
-  - conda init bash
-  - . ~/.bashrc
-  - conda activate cookiecutter
-  - pip install cookiecutter
-  - pip install ruamel.yaml
-
-script:
-  - pwd
-  # build a cookiecutter project test-env
-  - cookiecutter --config-file .cookiecutter-easydata-test.yml . -f --no-input
-  - conda deactivate
-  # create the environment from test-env
-  - cd test-env
-  - make create_environment
-  - conda activate test-env
-  - touch environment.yml
-  - make update_environment
-  # create test dataset
-  - python src/tests/make_test_datasets.py
-  # run tests on the src module
-  - export CI_RUNNING=yes
-  - make test_with_coverage
-  # test notebooks in docs
-  - pytest -v ../docs/test_docs.py
-
-after_success:
-  - conda activate test-env
-  - coveralls
\ No newline at end of file
From d233bfbbf03b162054aeca515d64cf9397a079c6 Mon Sep 17 00:00:00 2001
From: Kjell Wooding
Date: Wed, 1 Feb 2023 11:50:55 -0500
Subject: [PATCH 33/36] fix help messages. 
Some of these should not display --- {{ cookiecutter.repo_name }}/Makefile | 2 +- {{ cookiecutter.repo_name }}/Makefile.envs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index addf322..2533593 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -75,7 +75,7 @@ test: update_environment $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -## Run all Unit Tests with coverage +## Run all Unit and code coverage tests test_with_coverage: update_environment $(SET) LOGLEVEL=DEBUG; coverage run -m pytest --pyargs --doctest-modules --doctest-continue-on-failure --verbose \ $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 02feb89..43396df 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -15,7 +15,7 @@ else endif .PHONY: split_environment_files -## extract multi-phase dependencies from environment.yml and create ordering file +# extract multi-phase dependencies from environment.yml and create ordering file split_environment_files: environment.yml .make.bootstrap $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py $(PROJECT_DIR)environment.yml From 7c6b736d80fd081c55c4d1e58a19044f5d24e294 Mon Sep 17 00:00:00 2001 From: Kjell Wooding Date: Wed, 1 Feb 2023 11:51:10 -0500 Subject: [PATCH 34/36] remove lint target. We don't currently use this --- {{ cookiecutter.repo_name }}/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index 2533593..40d2c52 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -81,11 +81,6 @@ test_with_coverage: update_environment $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -.PHONY: lint -## Lint using flake8 -lint: - flake8 $(MODULE_NAME) - .phony: help_update_easydata help_update_easydata: @$(PYTHON_INTERPRETER) scripts/help-update.py From 223c1fb62b9e8e149fef21ef00ff16d00e7baca1 Mon Sep 17 00:00:00 2001 From: Kjell Wooding Date: Wed, 1 Feb 2023 11:51:28 -0500 Subject: [PATCH 35/36] change this warning to a variable on the standard help page --- {{ cookiecutter.repo_name }}/Makefile | 2 +- {{ cookiecutter.repo_name }}/Makefile.include | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index 40d2c52..15ba76e 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -100,7 +100,7 @@ debug: # Self Documenting Commands # ################################################################################# -HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM +HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM SHELL .DEFAULT_GOAL := show-help .PHONY: show-help diff --git a/{{ cookiecutter.repo_name }}/Makefile.include b/{{ cookiecutter.repo_name }}/Makefile.include index fc65727..85854ee 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.include +++ b/{{ cookiecutter.repo_name }}/Makefile.include @@ -20,5 +20,3 @@ SET ?= export WHICH ?= which DEVNULL ?= /dev/null CMDSEP ?= ; - -$(warning From here on, using SHELL = $(SHELL)) From c29fed27f48d327ef722d25d8c6d1bf2829aca8f Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 1 Feb 2023 12:02:07 -0500 Subject: 
[PATCH 36/36] modernize the template --- cookiecutter.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cookiecutter.json b/cookiecutter.json index d411e76..cf3153e 100644 --- a/cookiecutter.json +++ b/cookiecutter.json @@ -1,12 +1,12 @@ { "project_name": "project_name", "repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}", - "default_branch": ["master", "main"], + "default_branch": ["main", "master"], "module_name": "src", - "author_name": "Your name (or your organization/company/team)", + "author_name": "Your name (or the copyright holder)", "description": "A short description of this project.", "open_source_license": ["MIT", "BSD-2-Clause", "Proprietary"], - "python_version": ["3.7", "3.6", "latest", "3.8"], + "python_version": ["latest", "3.11", "3.10", "3.9", "3.8", "3.7"], "conda_path": "~/anaconda3/bin/conda", "upstream_location": ["github.com", "gitlab.com", "bitbucket.org", "your-custom-repo"] }
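
A minimal, illustrative sketch of the channel-order splitting behaviour introduced in PATCH 21 and adjusted in PATCH 28: `scripts/split_pip.py` reads an `environment.yml` that may contain a `channel-order:` section and `channel::package` dependencies, then writes one requirements file per conda channel, a pip requirements file, and a `.make.channel-order.include` ordering file. The sample environment contents below are invented for illustration; only the `.make.*` file names mirror the ones used in the patches, and this sketch is not the script itself.

```python
# Illustrative sketch only: mimics the per-channel split on an in-memory example.
from collections import defaultdict

example_env = {
    "name": "test-env",
    "channel-order": ["defaults", "conda-forge"],   # custom EasyData section
    "channels": ["defaults"],
    "dependencies": [
        "pip",
        {"pip": ["python-dotenv>=0.5.1", "nbval"]},  # nested pip section
        "scikit-learn",
        "conda-forge::make",                         # channel-prefixed dependency
    ],
}

channel_order = example_env["channel-order"] + ["pip"]
channel_dict = defaultdict(list)

for dep in example_env["dependencies"]:
    if isinstance(dep, dict):                # pip dependencies go to pip requirements
        channel_dict["pip"] = dep["pip"]
    elif "::" in dep:                        # channel-prefixed package
        channel, pkg = dep.split("::", 1)
        channel_dict[channel].append(pkg)
    else:                                    # plain package -> defaults channel
        channel_dict["defaults"].append(dep)

# Ordering file excludes pip; one requirements file per channel, one for pip.
print(" ".join(c for c in channel_order if c != "pip"))  # -> .make.channel-order.include
for channel in channel_order:
    target = (".make.pip-requirements.txt" if channel == "pip"
              else f".make.{channel}-environment.txt")
    print(target, "->", channel_dict[channel])
```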