From db25e0e1387e902a6559f41c439c8cad1c385778 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 15:52:13 -0800 Subject: [PATCH 01/36] update with latest changes --- .../reference/easydata/conda-environments.md | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md index e698b52..724d131 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md @@ -4,13 +4,19 @@ The `{{ cookiecutter.repo_name }}` repo is set up with template code to make man If you haven't yet, configure your conda environment. +**WARNING**: If you have conda-forge listed as a channel in your `.condarc` (or any other channels other than defaults), you may experience great difficulty generating reproducible conda environments. + +We recommend you remove conda-forge (and all other non-default channels) from your `.condarc` file and [set your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html). You can still use conda-forge (or any other conda channel), just specify it explicitly in your `environment.yml` by prefixing your package name with `channel-name::`; e.g. +``` + - wheel # install from the default (anaconda) channel + - pytorch::pytorch # install this from the `pytorch` channel + - conda-forge::tokenizers # install this from conda-forge +``` + ## Configuring your python environment Easydata uses conda to manage python packages installed by both conda **and pip**. ### Adjust your `.condarc` -**WARNING FOR EXISTING CONDA USERS**: If you have `conda-forge` listed as a channel in your `.condarc` (or any other channels other than `default`), **remove them**. These channels should be specified in `environment.yml` instead. - -We also recommend [setting your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html) to reduce package incompatibility problems. This will be the default in conda 5.0, but in order to assure reproducibility, we need to use this behavior now. ``` conda config --set channel_priority strict @@ -26,18 +32,30 @@ conda config --prepend channels defaults conda config --prepend envs_dirs ~/.conda/envs # Store environments in local dir for JupyterHub ``` -### Fix the CONDA_EXE path -* Make note of the path to your conda binary: +#### Locating the `conda` binary +Ensure the Makefile can find your conda binary, either by setting the `CONDA_EXE` environment variable, or by modifying `Makefile.include` directly. + +First, check if `CONDA_EXE` is already set ``` - $ which conda + >>> export | grep CONDA_EXE + CONDA_EXE=/Users/your_username/miniconda3/bin/conda +``` + +If `CONDA_EXE` is not set, you will need to set it manually in `Makefile.include`; i.e. + +* Make note of the path to your conda binary. It should be in the `bin` subdirectory of your Anaconda (or miniconda) installation directory: +``` + >>> which conda # this will only work if conda is in your PATH, otherwise, verify manually ~/miniconda3/bin/conda ``` -* ensure your `CONDA_EXE` environment variable is set correctly in `Makefile.include` +* ensure your `CONDA_EXE` environment variable is set to this value; i.e. ``` - export CONDA_EXE=~/miniconda3/bin/conda + >>> export CONDA_EXE=~/miniconda3/bin/conda ``` +or edit `Makefile.include` directly. 
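If you go the `Makefile.include` route instead, the change is a one-line Make variable assignment. A minimal sketch only (the exact assignment operator in your `Makefile.include` may differ; substitute the path reported by `which conda`):
```
CONDA_EXE := ~/miniconda3/bin/conda
```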
+ ### Create the conda environment -* Create and switch to the virtual environment: +Create and switch to the virtual environment: ``` cd {{ cookiecutter.repo_name }} make create_environment From 82239965fc9cbd7a432b8454f0e45143581ceef7 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 16:19:31 -0800 Subject: [PATCH 02/36] clean up --- .../reference/easydata/datasets.md | 29 ++++++++++--------- .../reference/easydata/git-workflow.md | 16 +++++----- .../reference/easydata/notebooks.md | 4 +-- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md index 5b54c1e..2cd4687 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md @@ -3,8 +3,8 @@ ## TL;DR To get started, all you really need to know is that you can query for available datasets via ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog() +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets") ``` and load these datasets via @@ -15,15 +15,18 @@ ds = Dataset.load(dataset_name) If you've followed the instructions from building the repo contained in the [README](../README.md), this should just work (if it doesn't, please let us know)! -You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.DESCR` and `ds.LICENSE`. +You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.README` and `ds.LICENSE`. -**Warning**: some of the datasets can be quite large. If you want to store your data externally, we recommend symlinking your data directory (that is the `{{ cookiecutter.repo_name }}/data` directory) to somewhere with more room before loading your first `Dataset`. +**Disk Space Note**: sometimes datasets can be quite large. If you want to store your data externally, we recommend pointing your data directory to a new location; that is, +```python +from {{ cookiecutter.module_name }} import paths +paths["data_path"] = "/path/to/big/data/directory" +``` ## Digging Deeper It is useful to know a little bit more about how Datasets work. - ## What is a `Dataset` object? A Dataset is the fundamental object we use for turning raw data into useful datasets, reproducibly. It is like a scikit-learn-style `Bunch` object --- essentially, a dictionary with some extra magic to make it nicer to work with --- containing the following attributes: @@ -36,7 +39,7 @@ A Dataset is the fundamental object we use for turning raw data into useful data The `data` attribute can really be any processed data form that you like: sometimes it's a pandas dataframe (like with `wine_reviews_130k`), a list of tuples containing other data, (`reddit_comment_tree_graphs`), or other formats including `scipy.sparse` matrices or `igraph` graphs. The `target` (if you're using it), expects something that matches the `data` in terms of length. -For a hint as to which data format to expect, you can look at the contents of the `DESCR` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. +For a hint as to which data format to expect, you can look at the contents of the `README` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. This `metadata` is where things get interesting... 
which we'll cover on its own next. @@ -44,9 +47,9 @@ This `metadata` is where things get interesting... which we'll cover on its own The `metadata` is where the magic lives. It serves several purposes in terms of bookkeeping: * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, -* it provides easy access to **what the data is** via the `DESCR` attribute, +* it provides easy access to **what the data is** via the `README` attribute, * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). -* it provides the **extra data manifest**, `EXTRA`, if your dataset includes around additional raw data (extra) files. +* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (extra) files. In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. @@ -73,21 +76,19 @@ ds.metadata To access the most common metadata fields: ```python -ds.DESCR # or ds.metadata['descr'] +ds.README # or ds.metadata['descr'] ds.LICENSE # or ds.metadata['license'] ds.HASHES # or ds.metadata['hashes'] ``` ## The catalog -While we do our best to keep the documentation in [Available Datasets](docs/available-datasets.md) up-to-date with what's in the code, you can explore all of the currently available `Datasets` via the `dataset_catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. +You can explore all of the currently available `Datasets` via the Dataset `Catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. To access the catalog: ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog(keys_only=True) +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets') ``` -If you're interested, set `keys_only=False` to see the complete contents of the metadata that is saved in the catalog. - ## Sharing your Data as a `Dataset` object In order to convert your data to a `Dataset` object, you will need to generate a catalog *recipe*, that uses a custom *function for processing your raw data*. Doing so allows us to document all the munging, pre-processing, and data verification necessary to reproducibly build the dataset. diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md index ce9e87d..3eecd80 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md @@ -1,5 +1,5 @@ -# The Easydata Git Workflow -Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][cookiecutter-easydata]. +# The EasyData Git Workflow +Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][easydata]. ## Git configuration @@ -49,7 +49,7 @@ git merge {{cookiecutter.default_branch}} git push origin my_branch ``` -### Do I have any stale branches? 
+### Clean up the junk With your local `{{cookiecutter.default_branch}}`, `origin/{{cookiecutter.default_branch}}` and `upstream/{{cookiecutter.default_branch}}` all in sync, we like to clean up any old branches that are fully merged (and hence, can be deleted without data loss.) ```bash git branch --merged {{cookiecutter.default_branch}} @@ -58,15 +58,15 @@ git branch -d A really great feature of `git branch -d` is that it will refuse to remove a branch that hasn't been fully merged into another. Thus it's safe to use without any fear of data loss. -### Time to start the day +### Start the day Once you've finished all your merge tasks, you can create a clean working branch from the latest `{{cookiecutter.default_branch}}` by doing a: ```bash git checkout {{cookiecutter.default_branch}} git checkout -b new_branch_name ``` +That's it! Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue in our +[easydata issue tracker]. -That's it!. Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue at -[cookiecutter-easydata]. - -[cookiecutter-easydata]: https://github.com/hackalog/cookiecutter-easydata/ \ No newline at end of file +[easydata issue tracker]: https://github.com/hackalog/easydata/issues +[easydata]: https://github.com/hackalog/easydata \ No newline at end of file diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md index f975369..4bae065 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md @@ -79,8 +79,7 @@ output_notebook(resources=INLINE) # Source module imports from {{ cookiecutter.module_name }} import paths -from {{ cookiecutter.module_name }}.data import DataSource, Dataset -from {{ cookiecutter.module_name }} import workflow +from {{ cookiecutter.module_name }}.data import DataSource, Dataset, Catalog ``` You can also find most of these header cells in [00-xyz-sample-notebook.ipynb](../notebooks/00-xyz-sample-notebook.ipynb) @@ -99,6 +98,7 @@ There is a whole world of cell magics. 
These are bits of code that you can put a * [README](../README.md) * [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) * [Getting and Using Datasets](datasets.md) +* [Specifying Paths in Easydata](paths.md) * [Using Notebooks for Analysis](notebooks.md) * [Sharing your Work](sharing-your-work.md) * [Troubleshooting Guide](troubleshooting.md) From ef04660557385d1c9a30117f78b58bcb38f81e8d Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 17:08:56 -0800 Subject: [PATCH 03/36] sync with snp changes, especially descr -> readme and extra -> fileset --- .../__init__.py | 1 + .../{{ cookiecutter.module_name }}/_paths.py | 39 ++-- .../data/__init__.py | 2 +- .../data/datasets.py | 189 +++++++++------ .../data/extra.py | 88 ------- .../data/transformer_functions.py | 74 ++++-- .../{{ cookiecutter.module_name }}/helpers.py | 218 +++++++++++++++++- .../workflow.py | 11 +- 8 files changed, 411 insertions(+), 211 deletions(-) delete mode 100644 {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py index cd3ea61..872135a 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py @@ -14,6 +14,7 @@ 'project_path': '${catalog_path}/..', 'raw_data_path': '${data_path}/raw', 'template_path': '${project_path}/reference/templates', + 'abfs_cache': '${interim_data_path}/abfs_cache', } _catalog_file = _module_dir.parent / "catalog" / "config.ini" diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py index 1c23d32..cc3a82c 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py @@ -1,7 +1,7 @@ from .decorators import SingletonDecorator from .kvstore import KVStore from .log import logger -import pathlib +import pathlib import Path class PathStore(KVStore): """Persistent Key-Value store for project-level paths @@ -13,15 +13,16 @@ class PathStore(KVStore): By default, the project directory is the parent of the directory containing the `config_file`: - >>> b['project_path'] - PosixPath('/tmpx/project') - >>> b['data_path'] - PosixPath('/tmpx/project/data') + + >>> b['project_path'] == Path('/tmpx/project').resolve() + True + >>> b['data_path'] == Path('/tmpx/project/data').resolve() + True The `catalog_path` is set upon instantiation and is read-only: - >>> b['catalog_path'] - PosixPath('/tmpx/project/catalog') + >>> b['catalog_path'] == Path('/tmpx/project/catalog').resolve() + True >>> b['catalog_path'] = '/tmp' Traceback (most recent call last): ... 
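The doctests above describe how the paths store resolves keys and expands them against one another. In project code, this is the mechanism that lets a single reassignment retarget every derived path, as the `datasets.md` change earlier in this series suggests. A minimal usage sketch (the module name is the cookiecutter placeholder used throughout this template; the target directory is only an example):
```python
from {{ cookiecutter.module_name }} import paths

paths["raw_data_path"]                              # resolved pathlib.Path, i.e. <data_path>/raw
paths["data_path"] = "/path/to/big/data/directory"  # example location only
paths["raw_data_path"]                              # now resolves under the new data_path
```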
@@ -30,21 +31,21 @@ class PathStore(KVStore): Changing a value changes all values that expand to contain it: >>> b['project_path'] = '/tmpy' - >>> b['project_path'] - PosixPath('/tmpy') - >>> b['data_path'] - PosixPath('/tmpy/data') + >>> b['project_path'] == Path('/tmpy').resolve() + True + >>> b['data_path'] == Path('/tmpy/data').resolve() + True We can have multiple levels of expansion: >>> b['raw_data_path'] = "${data_path}/raw" - >>> b['raw_data_path'] - PosixPath('/tmpy/data/raw') + >>> b['raw_data_path'] == Path('/tmpy/data/raw').resolve() + True >>> b['project_path'] = '/tmp3' - >>> b['data_path'] - PosixPath('/tmp3/data') - >>> b['raw_data_path'] - PosixPath('/tmp3/data/raw') + >>> b['data_path'] == Path('/tmp3/data').resolve() + True + >>> b['raw_data_path'] == Path('/tmp3/data/raw').resolve() + True """ # These keys should never be written to disk, though they may be used @@ -58,7 +59,7 @@ def __init__(self, *args, if config_file is None: self._config_file = "config.ini" else: - self._config_file = pathlib.Path(config_file) + self._config_file = Path(config_file) self._usage_warning = False super().__init__(*args, config_section=config_section, config_file=self._config_file, **kwargs) @@ -88,7 +89,7 @@ def __getitem__(self, key): if key in self._protected: return getattr(self, key) self._read() - return pathlib.Path(super().__getitem__(key)).resolve() + return Path(super().__getitem__(key)).resolve() @property def catalog_path(self): diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py index 4e7b43e..81d21fc 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py @@ -2,4 +2,4 @@ from .datasets import * from .fetch import * from .utils import * -from .extra import * +from .fileset import * diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py index 2fa411c..7f44b47 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py @@ -88,8 +88,10 @@ def __init__(self, catalog_file='datasets.json', **kwargs): """ - Object representing a dataset object. - Notionally compatible with scikit-learn's Bunch object + EasyData Dataset container Object. + + Contains metadata (README, LICENSE), associated file list (FILESET), and + optionally a data object. dataset_name: string (required) key to use for this dataset @@ -99,7 +101,7 @@ def __init__(self, Either classification target or label to be used. for each of the points in `data` metadata: dict - Data about the object. Key fields include `license_txt`, `descr`, and `hashes` + Data about the object. 
Key fields include `license`, `readme`, and `hashes` update_hashes: Boolean If True, recompute the data/target hashes in the Metadata """ @@ -118,7 +120,7 @@ def __init__(self, self['metadata']['dataset_name'] = dataset_name self['data'] = data self['target'] = target - #self['extra'] = Extra.from_dict(metadata.get('extra', None)) + #self['fileset'] = Fileset.from_dict(metadata.get('fileset', None)) data_hashes = self._generate_data_hashes() if update_hashes: @@ -153,10 +155,10 @@ def __setattr__(self, key, value): self['metadata'][key.lower()] = value elif key == 'name': self['metadata']['dataset_name'] = value - elif key in ['extra_base', 'extra_auth_kwargs']: + elif key in ['fileset_base', 'fileset_auth']: if self.name not in paths._config.sections(): paths._config.add_section(self.name) - if key == 'extra_auth_kwargs': + if key == 'fileset_auth': paths._config.set(self.name, key, json.dumps(value, sort_keys=True)) else: paths._config.set(self.name, key, value) @@ -170,7 +172,7 @@ def __delattr__(self, key): del self['metadata'][key.lower()] elif key == 'name': raise ValueError("name is mandatory") - elif key == 'extra_base': + elif key == 'fileset_base': if paths._config.has_section(self.name) and paths._config.has_option(self.name, key): paths._config.remove_option(self.name, key) paths._write() @@ -226,26 +228,67 @@ def resolve_local_config(self, key, default=None, kind="string"): raise ValueError(f"Unknown kind: {kind}") @property - def extra_base(self): - return self.resolve_local_config("extra_base", paths['processed_data_path'] / f"{self.name}.extra") + def fileset_base(self): + return self.resolve_local_config("fileset_base", paths['processed_data_path'] / f"{self.name}.fileset") @property - def extra_auth_kwargs(self): - return self.resolve_local_config("extra_auth_kwargs", "{}", kind="json") + def fileset_auth(self): + return self.resolve_local_config("fileset_auth", "{}", kind="json") + + def filesystem(self): + """Return an fsspec filesystem object associated with this fileset_base. + + If present, the kwargs specified in 'Dataset.fileset_auth' will be used to authenticate the connection. These must be valid + parameters to 'fsspec.open()' + + returns: fsspec.FileSystem object + + """ + f = fsspec.open(self.fileset_base, **self.fileset_auth) + return f.fs + + def fileset(self, dirs_only=False): + """Enumerate contents of fileset. 
+ + Automatically prepends `fileset_base` + + Parameters:: + dirs_only: Boolean + if True, returns only directory names containing files + if False, returns files and their associated hashes + + Useful for file formats that are actually directories, like parquet + + Returns: + if dirs_only is True: + list of directories containing files in the fileset + else + tuples of filenames, hashlists for every file in the fileset + """ + eb = self.fileset_base + sep = "/" + ret = [] + for subdir, filedict in self.FILESET.items(): + if dirs_only: + ret.append(sep.join([eb, subdir])) + else: # returns all files + for f, hashlist in filedict.items(): + ret.append((sep.join([eb, subdir, f]), hashlist)) + return ret # Note: won't work because of set/setattr magic above - #@extra_base.deleter - #def extra_base(self): - # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "extra_base"): - # paths._config.remove_option("extra_base") + #@fileset_base.deleter + #def fileset_base(self): + # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "fileset_base"): + # paths._config.remove_option("fileset_base") # Note: Won't work because of setattr magic above - #@extra_base.setter - #def extra_base(self, val): + #@fileset_base.setter + #def fileset_base(self, val): # if self.name not in paths._config.sections(): # paths._config.add_section(self.name) - # paths._config.set(self.name, "extra_base", val) + # paths._config.set(self.name, "fileset_base", val) # paths._write() # logger.debug(f"Writing {paths._config_file}") @@ -579,22 +622,22 @@ def verify_hashes(self, hashdict=None, catalog_path=None): hashdict = c[self.name]["hashes"] return hashdict.items() <= self.metadata['hashes'].items() - def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, hash_types=['size']): + def verify_fileset(self, fileset_base=None, file_dict=None, return_filelists=False, hash_types=['size']): """ - Verify that all files listed in the metadata EXTRA dict are accessible and have good hashes. + Verify that all files listed in the metadata FILESET dict are accessible and have good hashes. Returns boolean - True if all files are accessible and have good hashes - and optional file lists. Parameters ---------- - extra_base: path or None - base for the EXTRA filenames. + fileset_base: path or None + base for the FILESET filenames. 
if passed as explicit parameter, this location will be used - if omitted, the dataset `extra_base` will be read (which checks the local_config, - or self.EXTRA_BASE, in that order) - file_dict: sub-dict of extra dict - if None, default to the whole extra dict + if omitted, the dataset `fileset_base` will be read (which checks the local_config, + or self.FILESET_BASE, in that order) + file_dict: sub-dict of fileset dict + if None, default to the whole fileset dict return_filelists: boolean, default False if True, returns triple (good_hashes, bad_hashes, missing_files) else, returns Boolean (all files good) @@ -617,19 +660,19 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, files that are inaccessible """ - if extra_base is None: - extra_base = self.extra_base - extra_base = pathlib.Path(extra_base) - extra_dict = self.metadata.get('extra', None) + if fileset_base is None: + fileset_base = self.fileset_base + fileset_base = pathlib.Path(fileset_base) + fileset_dict = self.metadata.get('fileset', None) if file_dict is None: - file_dict = extra_dict + file_dict = fileset_dict else: - if not (file_dict.keys() <= extra_dict.keys()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict.keys() <= fileset_dict.keys()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") else: for key in file_dict.keys(): - if not (file_dict[key].items() <= extra_dict[key].items()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict[key].items() <= fileset_dict[key].items()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") retval = False bad_hash = [] @@ -641,7 +684,7 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: for directory in file_dict.keys(): for file, meta_hash_list in file_dict[directory].items(): - path = extra_base / directory / file + path = fileset_base / directory / file rel_path = pathlib.Path(directory) / file if path.exists(): disk_hash_list = [] @@ -660,52 +703,52 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: return retval - def subselect_extra(self, rel_files): - """Convert a (relative) pathname to an EXTRA dict + def subselect_fileset(self, rel_files): + """Convert a (relative) pathname to an FILESET dict - Suitable for passing to verify_extra() + Suitable for passing to verify_fileset() """ - extra_dict = defaultdict(dict) + fileset_dict = defaultdict(dict) for rel_file_path in rel_files: rel_path = pathlib.Path(rel_file_path) try: - hashlist = self.EXTRA[str(rel_path.parent)][rel_path.name] + hashlist = self.FILESET[str(rel_path.parent)][rel_path.name] except KeyError: - raise NotFoundError(f"Not in EXTRA: {rel_file_path}") from None - extra_dict[str(rel_path.parent)][rel_path.name] = hashlist - return dict(extra_dict) + raise NotFoundError(f"Not in FILESET: {rel_file_path}") from None + fileset_dict[str(rel_path.parent)][rel_path.name] = hashlist + return dict(fileset_dict) - def extra_file(self, relative_path): - """Convert a relative path (relative to extra_base) to a fully qualified location + def fileset_file(self, relative_path): + """Convert a relative path (relative to fileset_base) to a fully qualified location - extra_base may be prefixed with optional protocol like `s3://` and + fileset_base may be prefixed with optional protocol like `s3://` and is suitable for passing to fsspec.open_files() Parameters 
---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. """ - extra_base = self.extra_base - if extra_base.startswith("/"): - fqpath = str(pathlib.Path(extra_base) / relative_path) - elif extra_base.endswith('/'): - fqpath = f"{extra_base}{relative_path}" + fileset_base = self.fileset_base + if fileset_base.startswith("/"): + fqpath = str(pathlib.Path(fileset_base) / relative_path) + elif fileset_base.endswith('/'): + fqpath = f"{fileset_base}{relative_path}" else: - fqpath = f"{extra_base}/{relative_path}" + fqpath = f"{fileset_base}/{relative_path}" return fqpath - def open_extra(self, relative_path, auth_kwargs=None, **kwargs): - """Given a path (relative to extra_base), return an fsspec.OpenFile object + def open_fileset(self, relative_path, auth_kwargs=None, **kwargs): + """Given a path (relative to fileset_base), return an fsspec.OpenFile object Parameters ---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. auth_kwargs: dict or None @@ -717,7 +760,7 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): Examples -------- - >>> with ds.open_extra('2020-01-*.csv') as f: + >>> with ds.open_fileset('2020-01-*.csv') as f: ... df = pd.read_csv(f) # doctest: +SKIP Returns @@ -726,11 +769,11 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): be used as a single context """ if auth_kwargs is None: - auth_kwargs = self.extra_auth_kwargs + auth_kwargs = self.fileset_auth if auth_kwargs: logger.debug(f"Passing authentication information via auth_kwargs") - return fsspec.open(self.extra_file(relative_path), **auth_kwargs, **kwargs) + return fsspec.open(self.fileset_file(relative_path), **auth_kwargs, **kwargs) def dump(self, file_base=None, dump_path=None, hash_type='sha1', exists_ok=False, create_dirs=True, dump_metadata=True, update_catalog=True, @@ -867,8 +910,8 @@ def __init__(self, Value of hash used to verify file integrity file_name: string (optional) filename to use when saving file locally. If omitted, it will be inferred from url or source_file - name: string or {'DESCR', 'LICENSE'} (optional) - description of the file. of DESCR or LICENSE, will be used as metadata + name: string or {'README', 'LICENSE'} (optional) + description of the file. of README or LICENSE, will be used as metadata unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. @@ -909,14 +952,14 @@ def file_list(self): logger.warning("file_list is deprecated. 
Use file_dict instead") return list(self.file_dict.values()) - def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='DESCR', unpack_action='copy', force=False): + def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='README', unpack_action='copy', force=False): """Add metadata to a DataSource filename: create metadata entry from contents of this file. Relative to `metadata_path` contents: create metadata entry from this string metadata_path: (default `paths['raw_data_path']`) where to store metadata files - kind: {'DESCR', 'LICENSE'} + kind: {'README', 'LICENSE'} unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. force: boolean (default False) @@ -928,7 +971,7 @@ def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='D metadata_path = pathlib.Path(metadata_path) filename_map = { - 'DESCR': f'{self.name}.readme', + 'README': f'{self.name}.readme', 'LICENSE': f'{self.name}.license', } if kind not in filename_map: @@ -1337,7 +1380,7 @@ def process(self, return_X_y: boolean if True, returns (data, target) instead of a `Dataset` object. use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. """ if not self.unpacked_: logger.debug("process() called before unpack()") @@ -1373,13 +1416,13 @@ def process(self, def default_metadata(self, use_docstring=False): """Returns default metadata derived from this DataSource - This sets the dataset_name, and fills in `license` and `descr` + This sets the dataset_name, and fills in `license` and `readme` fields if they are present, either on disk, or in the file list Parameters ---------- use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. Returns ------- @@ -1388,12 +1431,12 @@ def default_metadata(self, use_docstring=False): metadata = {} optmap = { - 'DESCR': 'descr', + 'README': 'readme', 'LICENSE': 'license', } filemap = { 'license': f'{self.name}.license', - 'descr': f'{self.name}.readme' + 'readme': f'{self.name}.readme' } for key, fetch_dict in self.file_dict.items(): @@ -1406,7 +1449,7 @@ def default_metadata(self, use_docstring=False): if use_docstring: func = partial(self.process_function) fqfunc, invocation = partial_call_signature(func) - metadata['descr'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ + metadata['readme'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ f'{invocation}\n\n>>> help({func.func.__name__})\n\n' + \ f'{func.func.__doc__}' diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py deleted file mode 100644 index 74419cb..0000000 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Functions for handling "extra" data; i.e. collections of raw files associated with a Dataset -""" - -from collections import defaultdict -import pathlib -import shutil -import os - -from tqdm.auto import tqdm - -from .. 
import paths -from ..log import logger - -__all__ = [ - 'process_extra_files', -] - -def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", extra_dir=".extra", dataset_dir=None, do_copy=False): - """ - Process unpacked raw files into its minimal dataset components (data, target, metadata). - Here, 'minimal' means `data` and `target` will be None, and `extra` will contain a - file dict of files matching the specified file_glob (and their sizes). - - Parameters - ---------- - unpack_dir: default paths['interim_data_path'] - The directory the interim data files have been unpacked into - dataset_dir: default paths['processed_data_path'] - location of processed datasets. - extract_dir: - Name of the directory of the unpacked zip file containing the raw data files. - relative to unpack_dir - file_glob: string - Add only files matching this glob pattern to EXTRA - extra_dir: string - Used in building the file_dict keys. - do_copy: boolean - if True, actually copy the files. Otherwise just build EXTRA - - Returns - ------- - (data, target, additional_metadata) - - where - - data and target are None, - - metadata contains a file dict; i.e. - 'extra': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} - """ - if metadata is None: - metadata = {} - - if dataset_dir is None: - dataset_dir = paths['processed_data_path'] - else: - dataset_dir = pathlib.Path(dataset_dir) - if unpack_dir is None: - unpack_dir = paths['interim_data_path'] - else: - unpack_dir = pathlib.Path(unpack_dir) - if extract_dir is not None: - unpack_dir /= extract_dir - - extra_dir = pathlib.Path(extra_dir) - extra_dir_fq = dataset_dir / extra_dir - logger.debug(f"Do copy: {do_copy}") - if do_copy: - if extra_dir_fq.is_dir(): - logger.warning(f"Cleaning contents of {extra_dir}") - shutil.rmtree(extra_dir_fq) - logger.debug(f"Copying files to {extra_dir_fq}...") - - file_dict = defaultdict(dict) - files = sorted(list(unpack_dir.rglob(file_glob))) - for i, file in enumerate(tqdm(files)): - if file.is_dir(): - continue - relative_path = file.relative_to(unpack_dir) - extra_path = extra_dir / relative_path - file_dict[str(extra_path.parent)][str(extra_path.name)] = [f'size:{os.path.getsize(file)}'] - if do_copy: - os.makedirs(dataset_dir / extra_path.parent, exist_ok=True) - shutil.copyfile(file, dataset_dir / extra_path) - metadata['extra'] = dict(file_dict) - - return None, None, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py index 615a3bc..525ca5d 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py @@ -12,10 +12,11 @@ from ..utils import run_notebook __all__ = [ - 'run_notebook_transformer', 'apply_single_function', + 'copy_dataset', 'csv_to_pandas', 'new_dataset', + 'run_notebook_transformer', 'sklearn_train_test_split', 'sklearn_transform', ] @@ -163,23 +164,23 @@ def csv_to_pandas(ds_dict, *, output_map, **opts): new_ds = {} df = None for ds_name, dset in ds_dict.items(): - extra = dset.metadata.get('extra', None) - if extra is not None: - logger.debug(f"Input dataset {ds_name} has extra data. 
Processing...") - for rel_dir, file_dict in extra.items(): + fileset = dset.metadata.get('fileset', None) + if fileset is not None: + logger.debug(f"Input dataset {ds_name} has fileset data. Processing...") + for rel_dir, file_dict in fileset.items(): for new_dsname, csv_filename in output_map.items(): if csv_filename in file_dict: logger.debug(f"Found {csv_filename}. Creating {new_dsname} dataset") path = paths['processed_data_path'] / rel_dir / csv_filename df = pd.read_csv(path) new_metadata = dset.metadata - new_metadata.pop('extra', None) + new_metadata.pop('fileset', None) new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=df, metadata=new_metadata) return new_ds -def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_descr_txt, drop_extra, **opts): +def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_readme_txt, drop_fileset, **opts): """ Parameters ---------- @@ -189,12 +190,12 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali name of the dataset that the new dataset will be derived from dataset_name: name of the new dataset_catalog - added_descr_txt: Default None - new description text to be appended to the metadata descr + added_readme_txt: Default None + new description text to be appended to the metadata readme serialized_function: function (serialized by src.utils.serialize_partial) to run on .data to produce the new .data - drop_extra: boolean - drop the .extra part of the metadata + drop_fileset: boolean + drop the .fileset part of the metadata **opts: Remaining options will be ignored """ @@ -205,10 +206,10 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali ds = ds_dict.get(source_dataset_name) new_metadata = ds.metadata.copy() - new_metadata['descr'] += added_descr_txt - if drop_extra: - if new_metadata.get('extra', 0) != 0: - new_metadata.pop('extra') + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') logger.debug(f"Applying data function...") data_function=deserialize_partial(serialized_function) @@ -227,3 +228,46 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=preprocessed_corpus, metadata=new_metadata) return new_ds + +def copy_dataset(ds_dict, *, source_dataset_name, dataset_name, added_readme_txt, drop_fileset=True, **opts): + """ + Create a new dataset by copying an existing one + Parameters + ---------- + ds_dict: + input datasets. 
+ source_dataset_name: + name of the dataset that the new dataset will be derived from + dataset_name: + name of the new dataset_catalog + added_readme_txt: Default None + new description text to be appended to the metadata readme + drop_fileset: boolean + drop the .fileset part of the metadata + **opts: + Remaining options will be ignored + """ + + new_ds = {} + + logger.debug(f"Loading {source_dataset_name}...") + ds = ds_dict.get(source_dataset_name) + + new_metadata = ds.metadata.copy() + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') + + if drop_data: + new_data = None + else: + new_data = ds.data.copy() + + if drop_target: + new_target = None + else: + new_target = ds.target.copy() + + new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata) + return new_ds diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py index 1283bab..f103370 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py @@ -1,7 +1,9 @@ ## Script common ways of adding a dataset to the workflow from functools import partial +import fsspec import pathlib +import os from .log import logger from . import paths @@ -10,7 +12,7 @@ from .data import (DataSource, Dataset, hash_file, DatasetGraph, Catalog, serialize_transformer_pipeline) from .data.transformer_functions import csv_to_pandas, new_dataset, apply_single_function, run_notebook_transformer -from .data.extra import process_extra_files +from .data.fileset import process_fileset_files from .data.utils import serialize_partial __all__ = [ @@ -90,7 +92,7 @@ def notebook_as_transformer(notebook_name, *, # Create a Dataset from a single csv file def dataset_from_csv_manual_download(ds_name, csv_path, download_message, - license_str, descr_str, *, hash_type='sha1', + license_str, readme_str, *, hash_type='sha1', hash_value=None, overwrite_catalog=False,): """ @@ -107,7 +109,7 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, Hash, computed via the algorithm specified in `hash_type` license_str: str Contents of metadata license as text - descr_str: + readme_str: Contents of the metadata description as text overwrite_catalog: boolean If True, existing entries in datasets and transformers catalogs will be @@ -136,15 +138,15 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, hash_value=hash_value, unpack_action='copy', force=True) - dsrc.add_metadata(contents=descr_str, force=True) + dsrc.add_metadata(contents=readme_str, force=True) dsrc.add_metadata(contents=license_str, kind='LICENSE', force=True) - process_function = process_extra_files - process_function = process_extra_files + process_function = process_fileset_files + process_function = process_fileset_files process_function_kwargs = {'do_copy':True, 'file_glob':str(csv_path.name), - 'extra_dir': raw_ds_name+'.extra', - 'extract_dir': raw_ds_name} + 'fileset_dir': raw_ds_name+'.fileset', + 'filesetct_dir': raw_ds_name} dsrc.process_function = partial(process_function, **process_function_kwargs) datasource_catalog = Catalog.load('datasources') datasource_catalog[dsrc.name] = dsrc.to_dict() @@ -202,7 +204,7 @@ def dataset_from_metadata(dataset_name, metadata=None, overwrite_catalog=False): return ds -def 
dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_descr_txt, drop_extra=True, overwrite_catalog=False): +def dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_readme_txt, drop_fileset=True, overwrite_catalog=False): """ Create a derived dataset (dataset_name) via a single function call on .data from a previous dataset (source_dataset_name). @@ -213,8 +215,8 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func name of the dataset that the new dataset will be derived from dataset_name: name of the new dataset_catalog - added_descr_txt: Default None - new description text to be appended to the metadata descr + added_readme_txt: Default None + new description text to be appended to the metadata readme data_function: function (from src module) to run on .data to produce the new .data overwrite_catalog: boolean @@ -223,7 +225,7 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func dag = DatasetGraph(catalog_path=paths['catalog_path']) serialized_function = serialize_partial(data_function) transformers = [partial(apply_single_function, source_dataset_name=source_dataset_name, dataset_name=dataset_name, - serialized_function=serialized_function, added_descr_txt=added_descr_txt, drop_extra=drop_extra)] + serialized_function=serialized_function, added_readme_txt=added_readme_txt, drop_fileset=drop_fileset)] dag.add_edge(input_dataset=source_dataset_name, output_dataset=dataset_name, transformer_pipeline=serialize_transformer_pipeline(transformers), @@ -231,3 +233,195 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func ds = Dataset.from_catalog(dataset_name) logger.debug(f"{dataset_name} added to catalog") return ds + +def derived_dataset(*, dataset_name, source_dataset_name, added_readme_txt, + drop_fileset=True, drop_data=True, drop_target=False, + overwrite_catalog=False): + """ + Create a derived dataset (dataset_name) via a single function call on .data from a + previous dataset (source_dataset_name). + + Parameters + ---------- + source_dataset_name: + name of the dataset that the new dataset will be derived from + dataset_name: + name of the new dataset_catalog + added_readme_txt: Default None + new description text to be appended to the metadata readme + drop_fileset: boolean + If True, don't copy fileset data to new dataset + drop_data: boolean + If True, don't copy data to new dataset + drop_target: boolean + If True, don't copy target to new dataset + overwrite_catalog: boolean + if True, existing entries in datasets and transformers catalogs will be overwritten + """ + dag = DatasetGraph(catalog_path=paths['catalog_path']) + serialized_function = serialize_partial(data_function) + transformers = [partial(copy_dataset, source_dataset_name=source_dataset_name, dataset_name=dataset_name, + added_readme_txt=added_readme_txt, drop_fileset=drop_fileset, drop_data=drop_data, drop_target=drop_target)] + dag.add_edge(input_dataset=source_dataset_name, + output_dataset=dataset_name, + transformer_pipeline=serialize_transformer_pipeline(transformers), + overwrite_catalog=overwrite_catalog) + ds = Dataset.from_catalog(dataset_name) + logger.debug(f"{dataset_name} added to catalog") + return ds + +def metadata_from_fsspec(fs, path, metadata=None, fileset=None): + """Create metadata, FILESET file list from fsspec URL. 
+ + Creates a metadata dict representing a dataset + + + filenames in all uppercase are assumed to be metadata fields + + remaining files are used to populate FILESET data and have their hashes computed. + + Parameters + ---------- + fs: + fsspec.filesystem instance (already connected) + path: + relative to fs + metadata: + current contents of metadata dict. + Metadata obtained from fsurl will overwrite any similarly named fields in this dict + fileset: + Current contents of FILESET. new data will be appended. + Similarly named entries will be overwritten. + + returns metadata dict + """ + # There's a chance this should get rewritten to use 'fsspec.walk' + + if metadata is None: + metadata = {} + if fileset is None: + fileset = metadata.get('fileset', {}) + protocol = fs.protocol + dirs_done = [] + dirs = [path] + + while dirs: + dirname = dirs.pop() + rel_dirname = os.path.relpath(dirname, start=path) + dirs_done.append(dirname) + for file_info in fs.ls(dirname, detail=True): + file_type = file_info.get('type', None) + file_name = file_info['name'] + if file_type == 'directory': + dirs.append(file_name) + elif file_type == 'file': + basename = os.path.basename(os.path.normpath(file_name)) + if str.isupper(basename): + # Add to metadata + with fs.open(file_name, 'r') as fr: + contents = '\n'.join(fr.readlines()) + metadata[str.lower(basename)] = contents + else: + # add file and hash to FILESET + if protocol == "abfs": + # Cheap way to get md5 + md5_arr = file_info['content_settings']['content_md5'] + hashval = f"md5:{''.join('{:02x}'.format(x) for x in md5_arr)}" + else: + logger.warning(f"Unsupported fsspec filesystem: {fs.protocol}. Using size as hash") + hashval = f"size:{fs.size(file_name)}" + rel_path = os.path.relpath(file_info['name'], start=dirname) or "." + # fileset[rel_dirname][rel_path] = [hashval] + entry = {rel_path:[hashval]} + fileset.setdefault(rel_dirname,{}).update(entry) + else: + raise Exception(f"Unknown file type: {file_type}") + metadata["fileset"] = fileset + return metadata + + + +def dataset_from_fsurl(fsurl, + dataset_name=None, + fsspec_auth=None, + metadata=None, + fileset=None, + overwrite_catalog=True): + """Create a dataset from the contents of an fsspec URL + + 'fsurl' is assumed to be a directory/container/bucket. + + Files in this bucket with names entirely in UPPERCASE are assumed + to be textfiles and are used to populate metadata fields directly + as metadata fields (e.g. README, LICENSE) + + Other files have their hashes added to FILESET, and are included in + the FileSet (FILESET data) associated with the dataset. + + Parameters:: + + fsurl: fsspec URL + Should be a "directory", container, or "subdirectory" of said container. + dataset_name: string or None + Name to use for Dataset. + if None, name is the last component of the fsurl path + metadata: + current contents of metadata dict. + Metadata obtained from fsurl will overwrite any similarly named fields in this dict + fileset: + Current contents of FILESET. new data will be appended. + Similarly named entries will be overwritten. + overwrite_catalog: Boolean + if True, entry in Dataset catalog will be overwritten with the newly generated Dataset + + Returns:: + Dataset containing only metadata and FILESET info for all files in the specified fsspec URL. 
+ + """ + if fsspec_auth is None: + fsspec_auth = {} + + f = fsspec.open(fsurl, **fsspec_auth) + path = f.path + if dataset_name is None: + dataset_name = os.path.basename(os.path.normpath(path)) + logger.debug(f"Inferring dataset_name from fsurl: {dataset_name}") + fs = f.fs + protocol = fs.protocol + meta = metadata_from_fsspec(fs, path, metadata=metadata, fileset=fileset) + meta['fileset_base'] = fsurl + ds = dataset_from_metadata(dataset_name, + metadata=meta, + overwrite_catalog=overwrite_catalog) + return ds + +def derived_dataset(*, dataset_name, source_dataset, added_readme_txt=None, drop_fileset=True, data=None, target=None): + """Create a dataset by copying its metadata from another dataset + + Parameters + ---------- + added_readme_txt: string + String to be appended to the end of the new dataset's README metadata + drop_fileset: boolean + if True, ignore fileset when copying metadata + data: + Will be used as contents of new dataset's `data` + target: + Will be used as contents of new dataset's `target` + dataset_name: String + new dataset name + source_dataset: Dataset + Metadata will be copied from this dataset + + Returns + ------- + new (derived) Dataset object + """ + new_metadata = ds.metadata.copy() + if added_readme_txt: + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') + if new_metadata.get('hashes', 0) != 0: + new_metadata.pop('hashes') + ds_out = Dataset(dataset_name, metadata=new_metadata, data=data, target=target, **kwargs) + return ds_out diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py index df901a9..03c6d44 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py @@ -1,6 +1,11 @@ -# Workflow is where we patch around API issues in between releases. -# Nothing in this file is intended to be a stable API. use at your own risk, -# as its contents will be regularly deprecated +"""A module where we temporarily smooth our way around API issues in Easydata. + +This is a place where we temporarily address UX and API issues in Easydata, usually by writing convenient wrappers around existing functionality. + +Nothing in here is intended to be a stable API, so use at your own risk, as these contents are regularly deprecated. + +""" + import sys import logging from .data import Catalog, Dataset, DataSource From 58db18895d720f6ea7fc82d9f114d4556de3a7ee Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 17:33:00 -0800 Subject: [PATCH 04/36] fix typos --- .../reference/easydata/datasets.md | 6 +++--- .../reference/easydata/git-workflow.md | 2 +- .../reference/easydata/notebooks.md | 2 +- .../{{ cookiecutter.module_name }}/_paths.py | 2 +- .../data/transformer_functions.py | 8 -------- .../{{ cookiecutter.module_name }}/helpers.py | 7 +++++-- 6 files changed, 11 insertions(+), 16 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md index 2cd4687..ff5923c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md @@ -49,7 +49,7 @@ The `metadata` is where the magic lives. 
It serves several purposes in terms of * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, * it provides easy access to **what the data is** via the `README` attribute, * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). -* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (extra) files. +* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (fileset) files. In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. @@ -76,7 +76,7 @@ ds.metadata To access the most common metadata fields: ```python -ds.README # or ds.metadata['descr'] +ds.README # or ds.metadata['readme'] ds.LICENSE # or ds.metadata['license'] ds.HASHES # or ds.metadata['hashes'] ``` @@ -87,7 +87,7 @@ To access the catalog: ```python from {{ cookiecutter.module_name }}.data import Catalog -Catalog.load("datasets') +Catalog.load("datasets") ``` ## Sharing your Data as a `Dataset` object diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md index 3eecd80..50d5179 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md @@ -1,5 +1,5 @@ # The EasyData Git Workflow -Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][easydata]. +Here's our suggestion for a reliable git workflow that works well in **small team settings**; e.g. when using [Easydata][easydata] in a group setting. ## Git configuration diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md index 4bae065..270775c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md @@ -96,7 +96,7 @@ There is a whole world of cell magics. 
These are bits of code that you can put a ### Quick References * [README](../README.md) -* [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) +* [Setting up and Maintaining your Conda Environment, Reproducibly](conda-environments.md) * [Getting and Using Datasets](datasets.md) * [Specifying Paths in Easydata](paths.md) * [Using Notebooks for Analysis](notebooks.md) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py index cc3a82c..b938c9f 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py @@ -1,7 +1,7 @@ from .decorators import SingletonDecorator from .kvstore import KVStore from .log import logger -import pathlib import Path +from pathlib import Path class PathStore(KVStore): """Persistent Key-Value store for project-level paths diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py index 525ca5d..7cdf6ad 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py @@ -178,8 +178,6 @@ def csv_to_pandas(ds_dict, *, output_map, **opts): new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=df, metadata=new_metadata) return new_ds - - def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_readme_txt, drop_fileset, **opts): """ Parameters @@ -223,12 +221,6 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata) return new_ds - - new_metadata = ds.metadata.copy() - - new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=preprocessed_corpus, metadata=new_metadata) - return new_ds - def copy_dataset(ds_dict, *, source_dataset_name, dataset_name, added_readme_txt, drop_fileset=True, **opts): """ Create a new dataset by copying an existing one diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py index f103370..186704c 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py @@ -16,10 +16,13 @@ from .data.utils import serialize_partial __all__ = [ - 'notebook_as_transformer', 'dataset_from_csv_manual_download', + 'dataset_from_fsurl', 'dataset_from_metadata', 'dataset_from_single_function', + 'derived_dataset', + 'metadata_from_fsspec', + 'notebook_as_transformer', ] @@ -146,7 +149,7 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, process_function_kwargs = {'do_copy':True, 'file_glob':str(csv_path.name), 'fileset_dir': raw_ds_name+'.fileset', - 'filesetct_dir': raw_ds_name} + 'extract_dir': raw_ds_name} dsrc.process_function = partial(process_function, **process_function_kwargs) datasource_catalog = Catalog.load('datasources') datasource_catalog[dsrc.name] = dsrc.to_dict() From 6c6d7ef7a1ae25249a48ad62da54cf10a73cbc67 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 17:39:03 -0800 Subject: [PATCH 05/36] add missing file --- 
.../data/fileset.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py new file mode 100644 index 0000000..f69aced --- /dev/null +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py @@ -0,0 +1,88 @@ +""" +Functions for handling "fileset" data; i.e. collections of raw files associated with a Dataset +""" + +from collections import defaultdict +import pathlib +import shutil +import os + +from tqdm.auto import tqdm + +from .. import paths +from ..log import logger + +__all__ = [ + 'process_fileset_files', +] + +def process_fileset_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", fileset_dir=".fileset", dataset_dir=None, do_copy=False): + """ + Process unpacked raw files into its minimal dataset components (data, target, metadata). + Here, 'minimal' means `data` and `target` will be None, and `fileset` will contain a + file dict of files matching the specified file_glob (and their sizes). + + Parameters + ---------- + unpack_dir: default paths['interim_data_path'] + The directory the interim data files have been unpacked into + dataset_dir: default paths['processed_data_path'] + location of processed datasets. + extract_dir: + Name of the directory of the unpacked zip file containing the raw data files. + relative to unpack_dir + file_glob: string + Add only files matching this glob pattern to FILESET + fileset_dir: string + Used in building the file_dict keys. + do_copy: boolean + if True, actually copy the files. Otherwise just build FILESET + + Returns + ------- + (data, target, additional_metadata) + + where + + data and target are None, + + metadata contains a file dict; i.e. 
+ 'fileset': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} + """ + if metadata is None: + metadata = {} + + if dataset_dir is None: + dataset_dir = paths['processed_data_path'] + else: + dataset_dir = pathlib.Path(dataset_dir) + if unpack_dir is None: + unpack_dir = paths['interim_data_path'] + else: + unpack_dir = pathlib.Path(unpack_dir) + if extract_dir is not None: + unpack_dir /= extract_dir + + fileset_dir = pathlib.Path(fileset_dir) + fileset_dir_fq = dataset_dir / fileset_dir + logger.debug(f"Do copy: {do_copy}") + if do_copy: + if fileset_dir_fq.is_dir(): + logger.warning(f"Cleaning contents of {fileset_dir}") + shutil.rmtree(fileset_dir_fq) + logger.debug(f"Copying files to {fileset_dir_fq}...") + + file_dict = defaultdict(dict) + files = sorted(list(unpack_dir.rglob(file_glob))) + for i, file in enumerate(tqdm(files)): + if file.is_dir(): + continue + relative_path = file.relative_to(unpack_dir) + fileset_path = fileset_dir / relative_path + file_dict[str(fileset_path.parent)][str(fileset_path.name)] = [f'size:{os.path.getsize(file)}'] + if do_copy: + os.makedirs(dataset_dir / fileset_path.parent, exist_ok=True) + shutil.copyfile(file, dataset_dir / fileset_path) + metadata['fileset'] = dict(file_dict) + + return None, None, metadata From 210e7c5b50d00e5c6dad0f7184a15847fa89294e Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:00:02 -0800 Subject: [PATCH 06/36] fix test dataset generation --- .../data/process_functions.py | 38 +++++++++++ .../tests/make_test_datasets.py | 67 +++++-------------- 2 files changed, 54 insertions(+), 51 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py index 6054735..31cbb1e 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py @@ -3,6 +3,7 @@ """ import pathlib +from sklearn.datasets import fetch_20newsgroups from tqdm.auto import tqdm @@ -10,4 +11,41 @@ from ..log import logger __all__ = [ + 'process_20_newsgroups' ] + +def process_20_newsgroups(*, extract_dir='20_newsgroups', + metadata=None, unpack_dir=None, + opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): + """ + Process 20 newsgroups into (data, target, metadata) format. + + + Parameters + ---------- + unpack_dir: path + The interim parent directory the dataset files have been unpacked into. + extract_dir: str + Name of the directory of the unpacked files relative to the unpack_dir. Note that + opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} + Options to pass to sklearn.datasets.fetch_20newsgroups. 
+ + + Returns + ------- + A tuple: + (data, target, additional_metadata) + + """ + if metadata is None: + metadata = {} + + if unpack_dir is None: + unpack_dir = paths['interim_data_path'] + else: + unpack_dir = pathlib.Path(unpack_dir) + data_dir = unpack_dir / f"{extract_dir}" + + news = fetch_20newsgroups(**opts) + + return news.data, news.target, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 056f497..6e8d66e 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -1,16 +1,13 @@ from sklearn.datasets import fetch_20newsgroups from functools import partial -from {{ cookiecutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog -from {{ cookiecutter.module_name }} import workflow, paths -from {{ cookiecutter.module_name }}.log import logger +from src.data import DataSource, Dataset, DatasetGraph, Catalog +from src.data.process_functions import process_20_newsgroups +from src import paths +from src.log import logger # Set up a 20 newsgroups dataset -ds_name = '20_newsgroups' -output_ds_name = ds_name -dsrc = DataSource(ds_name) - license = """ Custom Academic License: "You may use this material free of charge for any educational purpose, provided attribution is given in any lectures or publications that make use of this material." As in http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.data.html. """ @@ -46,51 +43,19 @@ By default we follow the sklearn suggestion to set `remove=('headers', 'footers', 'quotes')` to avoid overfitting. """ +if __name__ =='__main__': + ds_name = '20_newsgroups' + output_ds_name = ds_name + dsrc = DataSource(ds_name) -dsrc.add_metadata(contents=metadata, force=True) -dsrc.add_metadata(contents=license, kind='LICENSE', force=True) - -def process_20_newsgroups(*, extract_dir='20_newsgroups', - metadata=None, unpack_dir=None, - opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): - """ - Process 20 newsgroups into (data, target, metadata) format. - - - Parameters - ---------- - unpack_dir: path - The interim parent directory the dataset files have been unpacked into. - extract_dir: str - Name of the directory of the unpacked files relative to the unpack_dir. Note that - opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} - Options to pass to sklearn.datasets.fetch_20newsgroups. 
- - - Returns - ------- - A tuple: - (data, target, additional_metadata) - - """ - if metadata is None: - metadata = {} - - if unpack_dir is None: - unpack_dir = paths['interim_data_path'] - else: - unpack_dir = pathlib.Path(unpack_dir) - data_dir = unpack_dir / f"{extract_dir}" - - news = fetch_20newsgroups(**opts) - - return news.data, news.target, metadata + dsrc.add_metadata(contents=metadata, force=True) + dsrc.add_metadata(contents=license, kind='LICENSE', force=True) -process_function = process_20_newsgroups -process_kwargs = {} + process_function = process_20_newsgroups + process_kwargs = {} -dsrc.process_function = partial(process_function, **process_kwargs) -dsrc.update_catalog() + dsrc.process_function = partial(process_function, **process_kwargs) + dsrc.update_catalog() -dag = DatasetGraph() -dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) + dag = DatasetGraph() + dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) From 944c39477209f8d365133d1e15f60c567dd83397 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:02:25 -0800 Subject: [PATCH 07/36] remove use of src --- .../tests/make_test_datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 6e8d66e..90db289 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -1,10 +1,10 @@ from sklearn.datasets import fetch_20newsgroups from functools import partial -from src.data import DataSource, Dataset, DatasetGraph, Catalog -from src.data.process_functions import process_20_newsgroups -from src import paths -from src.log import logger +from {{ cookicutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog +from {{ cookicutter.module_name }}.data.process_functions import process_20_newsgroups +from {{ cookicutter.module_name }} import paths +from {{ cookicutter.module_name }}.log import logger # Set up a 20 newsgroups dataset From 4f2d1189b8fb5b2989f7834373b93aea3bf22258 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:03:43 -0800 Subject: [PATCH 08/36] fix typo --- .../tests/make_test_datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 90db289..31f55a8 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -1,10 +1,10 @@ from sklearn.datasets import fetch_20newsgroups from functools import partial -from {{ cookicutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog -from {{ cookicutter.module_name }}.data.process_functions import process_20_newsgroups -from {{ cookicutter.module_name }} import paths -from {{ cookicutter.module_name }}.log import logger +from {{ cookiecutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog +from {{ cookiecutter.module_name }}.data.process_functions import process_20_newsgroups +from {{ cookiecutter.module_name }} 
import paths +from {{ cookiecutter.module_name }}.log import logger # Set up a 20 newsgroups dataset From 6f60931f215e78c0f5f54a9feed89d7402e0670a Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:47:38 -0800 Subject: [PATCH 09/36] try using a miniconda image --- .../.circleci/config.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/.circleci/config.yml b/{{ cookiecutter.repo_name }}/.circleci/config.yml index 86db8c0..10f4984 100644 --- a/{{ cookiecutter.repo_name }}/.circleci/config.yml +++ b/{{ cookiecutter.repo_name }}/.circleci/config.yml @@ -8,7 +8,8 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` - - image: circleci/python:3.7.0 + - image: continuumio/miniconda3 + # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -20,13 +21,13 @@ jobs: steps: - checkout - - run: - name: Set up Anaconda - command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; + #- run: + #name: Set up Anaconda + #command: | + # wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; + # chmod +x ~/miniconda.sh; + # ~/miniconda.sh -b -p ~/miniconda; + # echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - run: name: Create environment and contrive to always use it From d3cbe679218991b85466f0d86ad3af23080443fd Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Fri, 30 Dec 2022 18:49:58 -0800 Subject: [PATCH 10/36] remove comments --- {{ cookiecutter.repo_name }}/.circleci/config.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/.circleci/config.yml b/{{ cookiecutter.repo_name }}/.circleci/config.yml index 10f4984..98373ef 100644 --- a/{{ cookiecutter.repo_name }}/.circleci/config.yml +++ b/{{ cookiecutter.repo_name }}/.circleci/config.yml @@ -21,14 +21,6 @@ jobs: steps: - checkout - #- run: - #name: Set up Anaconda - #command: | - # wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - # chmod +x ~/miniconda.sh; - # ~/miniconda.sh -b -p ~/miniconda; - # echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - - run: name: Create environment and contrive to always use it command: | From 49153b1c21aab5bdd390b0abc3eb02e40f7f8f11 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:52:10 -0800 Subject: [PATCH 11/36] Updated config.yml --- .circleci/config.yml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0e16d21..861f07f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. 
`3.6.1-browsers` - - image: cimg/python:3.8.0 + - image: continuumio/miniconda3 # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -19,19 +19,7 @@ jobs: steps: - checkout - - - run: - name: Set up Anaconda - command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - export PATH=~/miniconda/bin:$PATH - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - conda update --yes --quiet conda; - conda init bash - sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV - + - run: name: Build cookiecutter environment and test-env project command: | From 0649fc3bdb0f1d101dba5cb7d753bea082a23d7e Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:58:26 -0800 Subject: [PATCH 12/36] Updated config.yml --- .circleci/config.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 861f07f..a0fd328 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -20,6 +20,14 @@ jobs: steps: - checkout + - run: + name: Set up Conda + command: | + conda init bash + conda update --yes --quiet conda; + export CONDA_EXE=/home/circleci/miniconda/bin/conda + sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV + - run: name: Build cookiecutter environment and test-env project command: | From 41907d3650d277cd790f5d4c46ce9c070f9f0a96 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:46:48 -0800 Subject: [PATCH 13/36] Updated config.yml --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a0fd328..4bd1897 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -35,7 +35,7 @@ jobs: conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml - mkdir /home/circleci/.cookiecutter_replay + mkdir -p /home/circleci/.cookiecutter_replay cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json pwd cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input From 39c611116fb392767224b2c8ac567fd34ceb30b9 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:52:21 -0800 Subject: [PATCH 14/36] Updated config.yml --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4bd1897..95898dd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,7 +31,7 @@ jobs: - run: name: Build cookiecutter environment and test-env project command: | - conda create -n cookiecutter --yes python=3.8 + conda create -n cookiecutter --yes python=3.8 make conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml @@ -48,6 +48,7 @@ jobs: export CONDA_EXE=/home/circleci/miniconda/bin/conda make create_environment conda activate test-env + conda install -c anaconda make touch environment.yml make update_environment echo "conda activate test-env" >> $BASH_ENV; From 1ba37cd34e4cb1e01e15a22bc7ad28eb4319c574 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:54:29 -0800 Subject: [PATCH 15/36] Updated config.yml --- .circleci/config.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 95898dd..35b2951 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,8 +39,7 @@ jobs: cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json pwd cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . -f --no-input - conda deactivate - + - run: name: Create test-env environment and contrive to always use it command: | From 951b80673b1847b97ebefb4c3ca26464fd5694c2 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:56:11 -0800 Subject: [PATCH 16/36] Updated config.yml --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 35b2951..487814b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,11 +38,13 @@ jobs: mkdir -p /home/circleci/.cookiecutter_replay cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json pwd + which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input - run: name: Create test-env environment and contrive to always use it command: | + conda activate cookiecutter cd test-env export CONDA_EXE=/home/circleci/miniconda/bin/conda make create_environment From dbe1a1af1e940a6004c0c4487a585a78a4db81bb Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 19:59:42 -0800 Subject: [PATCH 17/36] Updated config.yml --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 487814b..a589dd5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,6 +25,7 @@ jobs: command: | conda init bash conda update --yes --quiet conda; + which conda export CONDA_EXE=/home/circleci/miniconda/bin/conda sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV From 9f2d43addd08a7a5d5548850a35a1661fbbb6a82 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Fri, 30 Dec 2022 20:02:12 -0800 Subject: [PATCH 18/36] Updated config.yml --- .circleci/config.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a589dd5..6c79e63 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,8 +25,7 @@ jobs: command: | conda init bash conda update --yes --quiet conda; - which conda - export CONDA_EXE=/home/circleci/miniconda/bin/conda + export CONDA_EXE=/opt/conda/bin/conda sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV - run: @@ -36,8 +35,8 @@ jobs: conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml - mkdir -p /home/circleci/.cookiecutter_replay - cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json + mkdir -p /root/repo/.cookiecutter_replay + cp circleci-cookiecutter-easydata.json /root/repo/.cookiecutter_replay/cookiecutter-easydata.json pwd which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input @@ -47,7 +46,7 @@ jobs: command: | conda activate cookiecutter cd test-env - export CONDA_EXE=/home/circleci/miniconda/bin/conda + export CONDA_EXE=/opt/conda/bin/conda make create_environment conda activate test-env conda install -c anaconda make From ea355648556d12c1518cf80f03622726a2b11e18 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Sat, 31 Dec 2022 10:28:56 -0800 Subject: [PATCH 19/36] update extra -> fileset and descr -> readme --- docs/00-xyz-sample-notebook.ipynb | 2 +- docs/Add-csv-template.ipynb | 14 +++++++------- docs/Add-derived-dataset.ipynb | 10 +++++----- docs/New-Dataset-Template.ipynb | 12 ++++++------ docs/New-Edge-Template.ipynb | 4 ++-- docs/test_docs.py | 3 +++ 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/docs/00-xyz-sample-notebook.ipynb b/docs/00-xyz-sample-notebook.ipynb index a089002..cc90381 100644 --- a/docs/00-xyz-sample-notebook.ipynb +++ b/docs/00-xyz-sample-notebook.ipynb @@ -150,7 +150,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/Add-csv-template.ipynb b/docs/Add-csv-template.ipynb index ad69434..ad1e37d 100644 --- a/docs/Add-csv-template.ipynb +++ b/docs/Add-csv-template.ipynb @@ -83,7 +83,7 @@ "* `csv_path`: The desired path to your .csv file (in this case `epidemiology.csv`) relative to paths['raw_data_path']\n", "* `download_message`: The message to display to indicate to the user how to manually download your .csv file.\n", "* `license_str`: Information on the license for the dataset\n", - "* `descr_str`: Information on the dataset itself" + "* `readme_str`: Information on the dataset itself" ] }, { @@ -123,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "descr_str = \"\"\"\n", + "readme_str = \"\"\"\n", "The epidemiology table from Google's [COVID-19 Open-Data dataset](https://github.com/GoogleCloudPlatform/covid-19-open-data). \n", "\n", "The full dataset contains datasets of daily time-series data related to COVID-19 for over 20,000 distinct locations around the world. The data is at the spatial resolution of states/provinces for most regions and at county/municipality resolution for many countries such as Argentina, Brazil, Chile, Colombia, Czech Republic, Mexico, Netherlands, Peru, United Kingdom, and USA. All regions are assigned a unique location key, which resolves discrepancies between ISO / NUTS / FIPS codes, etc. The different aggregation levels are:\n", @@ -170,7 +170,7 @@ " csv_path=csv_path,\n", " download_message=download_message,\n", " license_str=license_str,\n", - " descr_str=descr_str,\n", + " readme_str=readme_str,\n", " overwrite_catalog=True)" ] }, @@ -206,9 +206,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.EXTRA`.\n", + "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.FILESET`.\n", "\n", - "The `.EXTRA` functionality is covered in other documentation." + "The `.FILESET` functionality is covered in other documentation." 
] }, { @@ -236,7 +236,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds_raw.EXTRA" + "ds_raw.FILESET" ] }, { @@ -246,7 +246,7 @@ "outputs": [], "source": [ "# fq path to epidemiology.csv file\n", - "ds_raw.extra_file('epidemiology.csv')" + "ds_raw.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/Add-derived-dataset.ipynb b/docs/Add-derived-dataset.ipynb index e639190..d5e93e4 100644 --- a/docs/Add-derived-dataset.ipynb +++ b/docs/Add-derived-dataset.ipynb @@ -85,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { @@ -219,7 +219,7 @@ " source_dataset_name\n", " dataset_name\n", " data_function\n", - " added_descr_txt\n", + " added_readme_txt\n", "\n", "We'll want our `data_function` to be defined in the project module (in this case `src`) for reproducibility reasons (which we've already done with `subselect_by_key` above)." ] @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "added_descr_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", + "added_readme_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", "to the {key} dataset.\"\"\"" ] }, @@ -281,7 +281,7 @@ " source_dataset_name=source_dataset_name,\n", " dataset_name=dataset_name,\n", " data_function=data_function,\n", - " added_descr_txt=added_descr_txt,\n", + " added_readme_txt=added_readme_txt,\n", " overwrite_catalog=True)" ] }, @@ -318,7 +318,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/New-Dataset-Template.ipynb b/docs/New-Dataset-Template.ipynb index bcf7826..abb8e88 100644 --- a/docs/New-Dataset-Template.ipynb +++ b/docs/New-Dataset-Template.ipynb @@ -167,7 +167,7 @@ "metadata": {}, "source": [ "### Create a process function\n", - "By default, we recommend that you use the `process_extra_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." + "By default, we recommend that you use the `process_fileset_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." 
] }, { @@ -176,11 +176,11 @@ "metadata": {}, "outputs": [], "source": [ - "from src.data.extra import process_extra_files\n", - "process_function = process_extra_files\n", + "from src.data.fileset import process_fileset_files\n", + "process_function = process_fileset_files\n", "process_function_kwargs = {'file_glob':'*.csv',\n", " 'do_copy': True,\n", - " 'extra_dir': ds_name+'.extra',\n", + " 'fileset_dir': ds_name+'.fileset',\n", " 'extract_dir': ds_name}" ] }, @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.EXTRA" + "ds.FILESET" ] }, { @@ -364,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.extra_file('epidemiology.csv')" + "ds.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/New-Edge-Template.ipynb b/docs/New-Edge-Template.ipynb index 6a1c5bb..3b1058e 100644 --- a/docs/New-Edge-Template.ipynb +++ b/docs/New-Edge-Template.ipynb @@ -88,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "source_ds.EXTRA" + "source_ds.FILESET" ] }, { @@ -178,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/test_docs.py b/docs/test_docs.py index 045cc56..2eb7922 100644 --- a/docs/test_docs.py +++ b/docs/test_docs.py @@ -9,6 +9,8 @@ import requests from src import paths +from src.log import logger + CCDS_ROOT = Path(__file__).parents[1].resolve() DOCS_DIR = CCDS_ROOT / "docs" @@ -35,6 +37,7 @@ def test_notebook_csv(self): csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv" csv_dest = paths['raw_data_path'] / "epidemiology.csv" if not csv_dest.exists(): + logger.DEBUG("Downloading epidemiology.csv") csv_file = requests.get(csv_url) with open(csv_dest, 'wb') as f: f.write(csv_file.content) From 0be828200d8823a1cbbe2c1254826435a30f550d Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Sat, 31 Dec 2022 11:34:02 -0800 Subject: [PATCH 20/36] change to lowercase --- docs/test_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/test_docs.py b/docs/test_docs.py index 2eb7922..7e8d17a 100644 --- a/docs/test_docs.py +++ b/docs/test_docs.py @@ -37,7 +37,7 @@ def test_notebook_csv(self): csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv" csv_dest = paths['raw_data_path'] / "epidemiology.csv" if not csv_dest.exists(): - logger.DEBUG("Downloading epidemiology.csv") + logger.debug("Downloading epidemiology.csv") csv_file = requests.get(csv_url) with open(csv_dest, 'wb') as f: f.write(csv_file.content) From 8920b93fc4a1edb19c200b99c274668418646dd7 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:37:53 -0500 Subject: [PATCH 21/36] handle arbitraty conda channels --- {{ cookiecutter.repo_name }}/Makefile.envs | 25 +++--- .../scripts/split_pip.py | 86 ++++++++++++------- 2 files changed, 63 insertions(+), 48 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 4c65eb7..5723f76 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -4,28 +4,22 @@ include Makefile.include -$(LOCKFILE): check_installation .make.bootstrap .make.pip-requirements.txt .make.environment-default.yml .make.conda-forge-requirements.txt +$(LOCKFILE): check_installation .make.bootstrap split_environment_files ifeq (conda, $(VIRTUALENV)) - $(CONDA_EXE) env update -n $(PROJECT_NAME) -f .make.environment-default.yml --prune - $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.conda-forge-requirements.txt --channel 
defaults --channel conda-forge --strict-channel-priority --yes + for channel in $(shell $(CAT) .make.channel-order.include); do\ + $(ECHO) installing from .make.$$channel-environment.txt;\ + $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$$channel-environment.txt --channel defaults --channel $$channel --strict-channel-priority --yes ;\ + done $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture pip install -r .make.pip-requirements.txt $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $(LOCKFILE) else $(error Unsupported Environment `$(VIRTUALENV)`. Use conda) endif -# extract multi-phase dependencies from environment.yml -.make.environment-pip.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip-yaml $(PROJECT_DIR)environment.yml > $@ - -.make.pip-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip $(PROJECT_DIR)environment.yml > $@ - -.make.conda-forge-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py conda-forge $(PROJECT_DIR)environment.yml > $@ - -.make.environment-default.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py default $(PROJECT_DIR)environment.yml > $@ +.PHONY: split_environment_files +## extract multi-phase dependencies from environment.yml and create ordering file +split_environment_files: environment.yml .make.bootstrap + $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py $(PROJECT_DIR)environment.yml .make.bootstrap: scripts/bootstrap.yml $(CONDA_EXE) env update -n $(PROJECT_NAME) -f scripts/bootstrap.yml @@ -69,6 +63,7 @@ endif # Checks that the conda environment is active environment_enabled: ifeq (conda,$(VIRTUALENV)) + $(CONDA_EXE) config --env --set channel_priority strict ifneq ($(notdir ${CONDA_DEFAULT_ENV}), $(PROJECT_NAME)) $(error Run "$(VIRTUALENV) activate $(PROJECT_NAME)" before proceeding...) endif diff --git a/{{ cookiecutter.repo_name }}/scripts/split_pip.py b/{{ cookiecutter.repo_name }}/scripts/split_pip.py index ecdc987..53e04fc 100644 --- a/{{ cookiecutter.repo_name }}/scripts/split_pip.py +++ b/{{ cookiecutter.repo_name }}/scripts/split_pip.py @@ -2,13 +2,19 @@ import json import sys import yaml +from collections import defaultdict -ACCEPTABLE_FORMATS = ["default", "pip", "pip-yaml", "conda-forge"] -def env_split(conda_env, kind="default"): - """Given a conda_environment dict, split into pip/nonpip versions +def env_split(conda_env, channel_order): + """Given a conda_environment dict, and a channel order, split into versions for each channel. 
+ + Returns: + + conda_env: (list) + remaining setup bits of the environment.yml file + channel_dict: (dict) + dict containing the list of dependencies by channel name - conda_env: dict Python object corresponding to environment.yml""" # Cheater way to make deep Copies json_copy = json.dumps(conda_env) @@ -17,49 +23,63 @@ def env_split(conda_env, kind="default"): pipdeps = None deplist = conda_env.pop('dependencies') - conda_forge_list = [] + channel_dict = defaultdict(list) for k, dep in enumerate(deplist[:]): # Note: copy list, as we mutate it if isinstance(dep, dict): # nested yaml if dep.get('pip', None): - pipdeps = ["pip", deplist.pop(k)] + channel_dict['pip'] = deplist.pop(k) else: - prefix = 'conda-forge::' - if dep.startswith(prefix): - conda_forge_list.append(dep[len(prefix):]) + prefix_check = dep.split('::') + if len(prefix_check) > 1: + channel = prefix_check[0] + if not channel in channel_order: + raise Exception(f'the channel {channel} required for {dep} is not specified in a channel-order section of the environment file') + channel_dict[f'{channel}'].append(prefix_check[1]) deplist.remove(dep) - conda_env['dependencies'] = deplist - pip_env['dependencies'] = pipdeps - return conda_env, pip_env, conda_forge_list + channel_dict['defaults'] = deplist + conda_env.pop('channel-order') + return conda_env, channel_dict + +def get_channel_order(conda_env): + """ + Given a conda_environment dict, get the channels from the channel order. + """ + channel_order = conda_env.get('channel-order') + + if channel_order is None: + channel_order = ['defaults'] + if not 'defaults' in channel_order: + channel_order.insert(0, 'defaults') + channel_order.append('pip') + return channel_order def usage(): print(f""" -Usage: split_pip.py [{"|".join(ACCEPTABLE_FORMATS)}] path/to/environment.yml +Usage: split_pip.py path/to/environment.yml """) if __name__ == '__main__': - if len(sys.argv) != 3: - usage() - exit(1) - - kind = sys.argv[1] - if kind not in ACCEPTABLE_FORMATS: + if len(sys.argv) != 2: usage() exit(1) - with open(sys.argv[2], 'r') as yamlfile: + with open(sys.argv[1], 'r') as yamlfile: conda_env = yaml.safe_load(yamlfile) - cenv, penv, forgelist = env_split(conda_env) - if kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "pip": - print("\n".join(penv["dependencies"].pop(-1)["pip"])) - elif kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "default": - _ = yaml.dump(cenv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "conda-forge": - print("\n".join(forgelist)) - else: - raise Exception(f"Invalid Kind: {kind}") + #check for acceptable formats + channel_order = get_channel_order(conda_env) + with open('.make.channel-order.include', 'w') as f: + f. 
write(' '.join(channel_order[:-1])) #exclude pip as a channel here + + cenv, channel_dict = env_split(conda_env, channel_order) + + for kind in channel_order: + if kind == "pip": + filename = '.make.pip-requirements.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict['pip']['pip'])) + else: + filename = f'.make.{kind}-environment.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict[kind])) From baa9fa2fa54e8ba5a0a03ea2fdd82019fc3ec305 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:38:14 -0500 Subject: [PATCH 22/36] use the template python version --- {{ cookiecutter.repo_name }}/scripts/bootstrap.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index c52f026..4997352 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -1,5 +1,13 @@ +{% macro pyver() -%} +{% if cookiecutter.python_version == 'latest' -%} + - python=3 +{% else -%} + - python={{ cookiecutter.python_version }} +{% endif -%} +{% endmacro -%} +name: {{ cookiecutter.repo_name }} channels: - defaults dependencies: - - python=3.7 - pyyaml +{{ pyver()|indent(2, true) }} From 09c1fd4dfa3507535822baa1bce9434fb70e446b Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:51:59 -0500 Subject: [PATCH 23/36] add test of the new environment code --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6c79e63..3bf0b03 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -40,12 +40,13 @@ jobs: pwd which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . 
-f --no-input - + - run: name: Create test-env environment and contrive to always use it command: | conda activate cookiecutter cd test-env + python scripts/tests/add-extra-channel-dependency.py export CONDA_EXE=/opt/conda/bin/conda make create_environment conda activate test-env From ceb636b39eb37eb719f0915fd8dc633b270b38b4 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 10:56:31 -0500 Subject: [PATCH 24/36] add missing file --- .../scripts/tests/add-extra-channel-dependency.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 {{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py diff --git a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py new file mode 100644 index 0000000..c615a4e --- /dev/null +++ b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py @@ -0,0 +1,14 @@ +import sys +import yaml + + +if __name__ == "__main__": + channel_order = ['defaults', 'pytorch'] + dependency_new = "pytorch::cpuonly" + + with open("environment.yml", "rt", encoding="utf-8") as file_env: + env = yaml.safe_load(file_env) + env["dependencies"].append(dependency_new) + env["channel_order"] = channel_order + with open("environment.yml", "wt", encoding="utf-8") as file_env: + yaml.safe_dump(env, file_env) From 9739191f857dff2bc8d420b98dbe449e556df651 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:08:37 -0500 Subject: [PATCH 25/36] fix indentation --- .../scripts/bootstrap.yml | 6 ++-- .../test-environment.yml | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 {{ cookiecutter.repo_name }}/test-environment.yml diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index 4997352..20cd12d 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -7,7 +7,7 @@ {% endmacro -%} name: {{ cookiecutter.repo_name }} channels: - - defaults + - defaults dependencies: - - pyyaml -{{ pyver()|indent(2, true) }} + - pyyaml +{{ pyver()|indent(3, true) }} diff --git a/{{ cookiecutter.repo_name }}/test-environment.yml b/{{ cookiecutter.repo_name }}/test-environment.yml new file mode 100644 index 0000000..9845bc6 --- /dev/null +++ b/{{ cookiecutter.repo_name }}/test-environment.yml @@ -0,0 +1,33 @@ +channel_order: +- defaults +- pytorch +channels: +- defaults +dependencies: +- pip +- pip: + - -e . 
+ - python-dotenv>=0.5.1 + - nbval + - nbdime + - gdown +- setuptools +- wheel +- git>=2.5 +- sphinx +- bokeh +- click +- colorcet +- coverage +- coveralls +- matplotlib +- jupyter +- scikit-learn +- scipy +- joblib +- nb_conda_kernels +- pandas +- requests +- pathlib +- fsspec +- pytorch::cpuonly From 25e41e37a0d59fa7241e8f35807b9aefe8a0dd21 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:12:04 -0500 Subject: [PATCH 26/36] fix typo and remove test file --- .../scripts/tests/add-extra-channel-dependency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py index c615a4e..8c41a6b 100644 --- a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py +++ b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py @@ -9,6 +9,6 @@ with open("environment.yml", "rt", encoding="utf-8") as file_env: env = yaml.safe_load(file_env) env["dependencies"].append(dependency_new) - env["channel_order"] = channel_order + env["channel-order"] = channel_order with open("environment.yml", "wt", encoding="utf-8") as file_env: yaml.safe_dump(env, file_env) From f0f548134adedc03b6fdd38ffabe7478b177e25c Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:25:17 -0500 Subject: [PATCH 27/36] for latest, defautl to conda latest python version and remove test file --- {{ cookiecutter.repo_name }}/environment.yml | 2 +- .../scripts/bootstrap.yml | 2 +- .../test-environment.yml | 33 ------------------- 3 files changed, 2 insertions(+), 35 deletions(-) delete mode 100644 {{ cookiecutter.repo_name }}/test-environment.yml diff --git a/{{ cookiecutter.repo_name }}/environment.yml b/{{ cookiecutter.repo_name }}/environment.yml index 6749871..5982a14 100644 --- a/{{ cookiecutter.repo_name }}/environment.yml +++ b/{{ cookiecutter.repo_name }}/environment.yml @@ -1,6 +1,6 @@ {% macro pyver() -%} {% if cookiecutter.python_version == 'latest' -%} - - python=3 + - python {% else -%} - python={{ cookiecutter.python_version }} {% endif -%} diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index 20cd12d..d0e5cc0 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -1,6 +1,6 @@ {% macro pyver() -%} {% if cookiecutter.python_version == 'latest' -%} - - python=3 + - python {% else -%} - python={{ cookiecutter.python_version }} {% endif -%} diff --git a/{{ cookiecutter.repo_name }}/test-environment.yml b/{{ cookiecutter.repo_name }}/test-environment.yml deleted file mode 100644 index 9845bc6..0000000 --- a/{{ cookiecutter.repo_name }}/test-environment.yml +++ /dev/null @@ -1,33 +0,0 @@ -channel_order: -- defaults -- pytorch -channels: -- defaults -dependencies: -- pip -- pip: - - -e . 
- - python-dotenv>=0.5.1 - - nbval - - nbdime - - gdown -- setuptools -- wheel -- git>=2.5 -- sphinx -- bokeh -- click -- colorcet -- coverage -- coveralls -- matplotlib -- jupyter -- scikit-learn -- scipy -- joblib -- nb_conda_kernels -- pandas -- requests -- pathlib -- fsspec -- pytorch::cpuonly From b22f548c11593b291a2920933d5c62e2e3385a34 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 25 Jan 2023 11:41:25 -0500 Subject: [PATCH 28/36] handle situation where channel-order doesn't exist --- .circleci/config.yml | 2 +- {{ cookiecutter.repo_name }}/scripts/split_pip.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3bf0b03..788c38a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,9 +46,9 @@ jobs: command: | conda activate cookiecutter cd test-env - python scripts/tests/add-extra-channel-dependency.py export CONDA_EXE=/opt/conda/bin/conda make create_environment + python scripts/tests/add-extra-channel-dependency.py conda activate test-env conda install -c anaconda make touch environment.yml diff --git a/{{ cookiecutter.repo_name }}/scripts/split_pip.py b/{{ cookiecutter.repo_name }}/scripts/split_pip.py index 53e04fc..62d059c 100644 --- a/{{ cookiecutter.repo_name }}/scripts/split_pip.py +++ b/{{ cookiecutter.repo_name }}/scripts/split_pip.py @@ -39,7 +39,7 @@ def env_split(conda_env, channel_order): deplist.remove(dep) channel_dict['defaults'] = deplist - conda_env.pop('channel-order') + conda_env.pop('channel-order', None) return conda_env, channel_dict def get_channel_order(conda_env): From 756485e283d90af3cc140913bf0e9a80cf2180c0 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Mon, 30 Jan 2023 21:54:11 -0500 Subject: [PATCH 29/36] use a windows friendly loop --- {{ cookiecutter.repo_name }}/Makefile.envs | 6 ++---- {{ cookiecutter.repo_name }}/Makefile.include | 1 + {{ cookiecutter.repo_name }}/Makefile.win32 | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 5723f76..02feb89 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -6,10 +6,8 @@ include Makefile.include $(LOCKFILE): check_installation .make.bootstrap split_environment_files ifeq (conda, $(VIRTUALENV)) - for channel in $(shell $(CAT) .make.channel-order.include); do\ - $(ECHO) installing from .make.$$channel-environment.txt;\ - $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$$channel-environment.txt --channel defaults --channel $$channel --strict-channel-priority --yes ;\ - done + $(foreach channel, $(shell $(CAT) .make.channel-order.include),\ + $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$(channel)-environment.txt --channel defaults --channel $(channel) --strict-channel-priority --yes $(CMDSEP)) $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture pip install -r .make.pip-requirements.txt $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $(LOCKFILE) else diff --git a/{{ cookiecutter.repo_name }}/Makefile.include b/{{ cookiecutter.repo_name }}/Makefile.include index e8486ca..fc65727 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.include +++ b/{{ cookiecutter.repo_name }}/Makefile.include @@ -19,5 +19,6 @@ CAT ?= cat SET ?= export WHICH ?= which DEVNULL ?= /dev/null +CMDSEP ?= ; $(warning From here on, using SHELL = $(SHELL)) diff --git a/{{ cookiecutter.repo_name }}/Makefile.win32 b/{{ cookiecutter.repo_name }}/Makefile.win32 index 92d8800..de046eb 
100644 --- a/{{ cookiecutter.repo_name }}/Makefile.win32 +++ b/{{ cookiecutter.repo_name }}/Makefile.win32 @@ -5,6 +5,7 @@ CAT = type SET = set WHICH = where DEVNULL = nul +CMDSEP = & # Some UNIXish packages force the installation of a Bourne-compatible shell, and Make # prefers using this when it sees it. We thus force the usage of the good ole Batch From 6e69de1eff50e7931978ac951916e26fdb18d776 Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 1 Feb 2023 11:01:34 -0500 Subject: [PATCH 30/36] update environment management instructions to include channel order and arbitrary channel use --- .../reference/easydata/conda-environments.md | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md index 724d131..60a9a9f 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md @@ -81,6 +81,7 @@ When adding packages to your python environment, **do not `pip install` or `cond Your `environment.yml` file will look something like this: ``` name: {{ cookiecutter.repo_name }} +dependencies: - pip - pip: - -e . # conda >= 4.4 only @@ -106,7 +107,7 @@ name: {{ cookiecutter.repo_name }} ``` To add any package available from conda, add it to the end of the list. If you have a PYPI dependency that's not avaible via conda, add it to the list of pip installable dependencies under ` - pip:`. -You can include any {{ cookiecutter.upstream_location }} python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. +You can include any `{{ cookiecutter.upstream_location }}` python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. In particular, if you're working off of a fork or a work in progress branch of a repo in {{ cookiecutter.upstream_location }} (say, your personal version of ), you can change `git+https://{{ cookiecutter.upstream_location }}//` to @@ -117,6 +118,43 @@ Once you're done your edits, run `make update_environment` and voila, you're upd To share your updated environment, check in your `environment.yml` file. (More on this in [Sharing your Work](sharing-your-work.md)) +#### Adding packages from other conda channels +Say we want to add a package only available from the `conda-forge` conda channel and not the default conda channel. (The conda channel is what follows `-c` when using `conda install -c my-channel my-package`. Suppose we want to use `make` on windows. Then we need to use `conda-forge` since the default conda channel only has linux and macOS installations of `make`. To normally conda install this, we would use `conda install -c conda-forge make`. **We won't do that here**. + +Instead, we add a `channel-order` section that starts with `defaults` and lists the other channels we want to use in the order we want to install from them (note that this is a custom EasyData section to the `environment.yml`). Then we add our package in the dependency list in the form `channel-name::package-name`, for example, `conda-forge::make`. + +In this case an updated `environment.yml` file looks like this: +``` +name: {{ cookiecutter.repo_name }} +channel-order: + - defaults + - conda-forge +dependencies: + - pip + - pip: + - -e . 
# conda >= 4.4 only + - python-dotenv>=0.5.1 + - nbval + - nbdime + - umap-learn + - gdown + - setuptools + - wheel + - git>=2.5 # for git worktree template updating + - sphinx + - bokeh + - click + - colorcet + - coverage + - coveralls + - datashader + - holoviews + - matplotlib + - jupyter + - conda-forge::make +... +``` + #### Lock files Now, we'll admit that this workflow isn't perfectly reproducible in the sense that conda still has to resolve versions from the `environment.yml`. To make it more reproducible, running either `make create_environment` or `make update_environment` will generate an `environment.{$ARCH}.lock.yml` (e.g. `environment.i386.lock.yml`). This file keeps a record of the exact environment that is currently installed in your conda environment `{{ cookiecutter.repo_name }}`. If you ever need to reproduce an environment exactly, you can install from the `.lock.yml` file. (Note: These are architecture dependent). From 87159f708fb817b0e2b5be5004d5d6a226efedd2 Mon Sep 17 00:00:00 2001 From: Amy Wooding <36967030+acwooding@users.noreply.github.com> Date: Wed, 1 Feb 2023 11:29:02 -0500 Subject: [PATCH 31/36] Update README.md Include references to documentation and where to look for more information --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 2f2a732..7497c45 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,24 @@ python -m pip install -f requirements.txt cookiecutter https://github.com/hackalog/easydata +### To find out more +------------ +A good place to start is with reproducible environments. We have a tutorial here: [Getting Started with EasyData Environments](https://github.com/hackalog/easydata/wiki/Getting-Started-with-EasyData-Environments). + +The next place to look is in the customized documentation that is in any EasyData created repo. It is customized to the settings that you put in your template. 
These reference documents live under `references/easydata` and cover:
+ * more on conda environments
+ * more on paths
+ * git configuration (including setting up ssh with GitHub)
+ * git workflows
+ * tricks for using Jupyter notebooks in an EasyData environment
+ * troubleshooting
+ * recommendations for how to share your work
+
+Furthermore, see:
+* [The EasyData documentation on read the docs](https://cookiecutter-easydata.readthedocs.io/en/latest/?badge=latest): this contains up-to-date working examples of how to use EasyData for reproducible datasets and some ways to use notebooks reproducibly
+* [Talks and Tutorials based on EasyData](https://github.com/hackalog/easydata/wiki/EasyData-Talks-and-Tutorials)
+* [Catalog of EasyData Documentation](https://github.com/hackalog/easydata/wiki/Catalog-of-EasyData-Documentation)
+* [The EasyData wiki](https://github.com/hackalog/easydata/wiki): check here for further troubleshooting and how-to guides for problems that aren't covered in the `references/easydata` docs (including a `git` tutorial).
 ### The resulting directory structure
 ------------
From 4b541c9df22179400154d1690c709fe0fa29c3c6 Mon Sep 17 00:00:00 2001
From: Amy Wooding
Date: Wed, 1 Feb 2023 11:40:18 -0500
Subject: [PATCH 32/36] remove travis.ci testing

---
 .travis.yml | 51 ---------------------------------------------------
 1 file changed, 51 deletions(-)
 delete mode 100644 .travis.yml

diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index b110146..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-language: python
-
-cache:
-  directories:
-  - $HOME/.cache/pip
-
-python:
-  - "3.8"
-
-envs:
-  - REQUIRED_PYTHON="python3"
-
-install:
-  # install miniconda
-  - deactivate
-  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-  - MINICONDA_PATH=/home/travis/miniconda3
-  - chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH
-  - chmod +x $MINICONDA_PATH
-  - export PATH=$MINICONDA_PATH/condabin:$PATH
-  - conda update --yes conda
-  # create cookiecutter environment
-  - conda create -n cookiecutter --yes python=3.8
-  - conda init bash
-  - . ~/.bashrc
-  - conda activate cookiecutter
-  - pip install cookiecutter
-  - pip install ruamel.yaml
-
-script:
-  - pwd
-  # build a cookiecutter project test-env
-  - cookiecutter --config-file .cookiecutter-easydata-test.yml . -f --no-input
-  - conda deactivate
-  # create the environment from test-env
-  - cd test-env
-  - make create_environment
-  - conda activate test-env
-  - touch environment.yml
-  - make update_environment
-  # create test dataset
-  - python src/tests/make_test_datasets.py
-  # run tests on the src module
-  - export CI_RUNNING=yes
-  - make test_with_coverage
-  # test notebooks in docs
-  - pytest -v ../docs/test_docs.py
-
-after_success:
-  - conda activate test-env
-  - coveralls
\ No newline at end of file
From d233bfbbf03b162054aeca515d64cf9397a079c6 Mon Sep 17 00:00:00 2001
From: Kjell Wooding
Date: Wed, 1 Feb 2023 11:50:55 -0500
Subject: [PATCH 33/36] fix help messages. 
Some of these should not display --- {{ cookiecutter.repo_name }}/Makefile | 2 +- {{ cookiecutter.repo_name }}/Makefile.envs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index addf322..2533593 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -75,7 +75,7 @@ test: update_environment $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -## Run all Unit Tests with coverage +## Run all Unit and code coverage tests test_with_coverage: update_environment $(SET) LOGLEVEL=DEBUG; coverage run -m pytest --pyargs --doctest-modules --doctest-continue-on-failure --verbose \ $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 02feb89..43396df 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -15,7 +15,7 @@ else endif .PHONY: split_environment_files -## extract multi-phase dependencies from environment.yml and create ordering file +# extract multi-phase dependencies from environment.yml and create ordering file split_environment_files: environment.yml .make.bootstrap $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py $(PROJECT_DIR)environment.yml From 7c6b736d80fd081c55c4d1e58a19044f5d24e294 Mon Sep 17 00:00:00 2001 From: Kjell Wooding Date: Wed, 1 Feb 2023 11:51:10 -0500 Subject: [PATCH 34/36] remove lint target. We don't currently use this --- {{ cookiecutter.repo_name }}/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index 2533593..40d2c52 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -81,11 +81,6 @@ test_with_coverage: update_environment $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -.PHONY: lint -## Lint using flake8 -lint: - flake8 $(MODULE_NAME) - .phony: help_update_easydata help_update_easydata: @$(PYTHON_INTERPRETER) scripts/help-update.py From 223c1fb62b9e8e149fef21ef00ff16d00e7baca1 Mon Sep 17 00:00:00 2001 From: Kjell Wooding Date: Wed, 1 Feb 2023 11:51:28 -0500 Subject: [PATCH 35/36] change this warning to a variable on the standard help page --- {{ cookiecutter.repo_name }}/Makefile | 2 +- {{ cookiecutter.repo_name }}/Makefile.include | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index 40d2c52..15ba76e 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -100,7 +100,7 @@ debug: # Self Documenting Commands # ################################################################################# -HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM +HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM SHELL .DEFAULT_GOAL := show-help .PHONY: show-help diff --git a/{{ cookiecutter.repo_name }}/Makefile.include b/{{ cookiecutter.repo_name }}/Makefile.include index fc65727..85854ee 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.include +++ b/{{ cookiecutter.repo_name }}/Makefile.include @@ -20,5 +20,3 @@ SET ?= export WHICH ?= which DEVNULL ?= /dev/null CMDSEP ?= ; - -$(warning From here on, using SHELL = $(SHELL)) From c29fed27f48d327ef722d25d8c6d1bf2829aca8f Mon Sep 17 00:00:00 2001 From: Amy Wooding Date: Wed, 1 Feb 2023 12:02:07 -0500 Subject: 
[PATCH 36/36] modernize the template --- cookiecutter.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cookiecutter.json b/cookiecutter.json index d411e76..cf3153e 100644 --- a/cookiecutter.json +++ b/cookiecutter.json @@ -1,12 +1,12 @@ { "project_name": "project_name", "repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}", - "default_branch": ["master", "main"], + "default_branch": ["main", "master"], "module_name": "src", - "author_name": "Your name (or your organization/company/team)", + "author_name": "Your name (or the copyright holder)", "description": "A short description of this project.", "open_source_license": ["MIT", "BSD-2-Clause", "Proprietary"], - "python_version": ["3.7", "3.6", "latest", "3.8"], + "python_version": ["latest", "3.11", "3.10", "3.9", "3.8", "3.7"], "conda_path": "~/anaconda3/bin/conda", "upstream_location": ["github.com", "gitlab.com", "bitbucket.org", "your-custom-repo"] }
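
A minimal, illustrative sketch of the channel-order splitting behaviour introduced in PATCH 21 and adjusted in PATCH 28: `scripts/split_pip.py` reads an `environment.yml` that may contain a `channel-order:` section and `channel::package` dependencies, then writes one requirements file per conda channel, a pip requirements file, and a `.make.channel-order.include` ordering file. The sample environment contents below are invented for illustration; only the `.make.*` file names mirror the ones used in the patches, and this sketch is not the script itself.

```python
# Illustrative sketch only: mimics the per-channel split on an in-memory example.
from collections import defaultdict

example_env = {
    "name": "test-env",
    "channel-order": ["defaults", "conda-forge"],   # custom EasyData section
    "channels": ["defaults"],
    "dependencies": [
        "pip",
        {"pip": ["python-dotenv>=0.5.1", "nbval"]},  # nested pip section
        "scikit-learn",
        "conda-forge::make",                         # channel-prefixed dependency
    ],
}

channel_order = example_env["channel-order"] + ["pip"]
channel_dict = defaultdict(list)

for dep in example_env["dependencies"]:
    if isinstance(dep, dict):                # pip dependencies go to pip requirements
        channel_dict["pip"] = dep["pip"]
    elif "::" in dep:                        # channel-prefixed package
        channel, pkg = dep.split("::", 1)
        channel_dict[channel].append(pkg)
    else:                                    # plain package -> defaults channel
        channel_dict["defaults"].append(dep)

# Ordering file excludes pip; one requirements file per channel, one for pip.
print(" ".join(c for c in channel_order if c != "pip"))  # -> .make.channel-order.include
for channel in channel_order:
    target = (".make.pip-requirements.txt" if channel == "pip"
              else f".make.{channel}-environment.txt")
    print(target, "->", channel_dict[channel])
```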