diff --git a/.circleci/config.yml b/.circleci/config.yml index 0e16d21..788c38a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,7 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` - - image: cimg/python:3.8.0 + - image: continuumio/miniconda3 # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -19,39 +19,38 @@ jobs: steps: - checkout - + - run: - name: Set up Anaconda + name: Set up Conda command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - export PATH=~/miniconda/bin:$PATH - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - conda update --yes --quiet conda; conda init bash - sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV - + conda update --yes --quiet conda; + export CONDA_EXE=/opt/conda/bin/conda + sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV + - run: name: Build cookiecutter environment and test-env project command: | - conda create -n cookiecutter --yes python=3.8 + conda create -n cookiecutter --yes python=3.8 make conda activate cookiecutter pip install cookiecutter pip install ruamel.yaml - mkdir /home/circleci/.cookiecutter_replay - cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json + mkdir -p /root/repo/.cookiecutter_replay + cp circleci-cookiecutter-easydata.json /root/repo/.cookiecutter_replay/cookiecutter-easydata.json pwd + which make cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . -f --no-input - conda deactivate - run: name: Create test-env environment and contrive to always use it command: | + conda activate cookiecutter cd test-env - export CONDA_EXE=/home/circleci/miniconda/bin/conda + export CONDA_EXE=/opt/conda/bin/conda make create_environment + python scripts/tests/add-extra-channel-dependency.py conda activate test-env + conda install -c anaconda make touch environment.yml make update_environment echo "conda activate test-env" >> $BASH_ENV; diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b110146..0000000 --- a/.travis.yml +++ /dev/null @@ -1,51 +0,0 @@ -language: python - -cache: - directories: - - $HOME/.cache/pip - -python: - - "3.8" - -envs: - - REQUIRED_PYTHON="python3" - -install: - # install miniconda - - deactivate - - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - - MINICONDA_PATH=/home/travis/miniconda3 - - chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH - - chmod +x $MINICONDA_PATH - - export PATH=$MINICONDA_PATH/condabin:$PATH - - conda update --yes conda - # create cookiecutter environment - - conda create -n cookiecutter --yes python=3.8 - - conda init bash - - . ~/.bashrc - - conda activate cookiecutter - - pip install cookiecutter - - pip install ruamel.yaml - -script: - - pwd - # build a cookiecutter project test-env - - cookiecutter --config-file .cookiecutter-easydata-test.yml . 
-f --no-input - - conda deactivate - # create the environment from test-env - - cd test-env - - make create_environment - - conda activate test-env - - touch environment.yml - - make update_environment - # create test dataset - - python src/tests/make_test_datasets.py - # run tests on the src module - - export CI_RUNNING=yes - - make test_with_coverage - # test notebooks in docs - - pytest -v ../docs/test_docs.py - -after_success: - - conda activate test-env - - coveralls \ No newline at end of file diff --git a/README.md b/README.md index 2f2a732..7497c45 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,24 @@ python -m pip install -f requirements.txt cookiecutter https://github.com/hackalog/easydata +### To find out more +------------ +A good place to start is with reproducible environments. We have a tutorial here: [Getting Started with EasyData Environments](https://github.com/hackalog/easydata/wiki/Getting-Started-with-EasyData-Environments). + +The next place to look is in the customized documentation that is in any EasyData created repo. It is customized to the settings that you put in your template. These are reference documents that can be found under `references/easydata` that are customized to your repo that cover: + * more on conda environments + * more on paths + * git configuration (including setting up ssh with GitHub) + * git workflows + * tricks for using Jupyter notebooks in an EasyData environment + * troubleshooting + * recommendations for how to share your work + +Furthermore, see: +* [The EasyData documentation on read the docs](https://cookiecutter-easydata.readthedocs.io/en/latest/?badge=latest): this contains up-to-date working exmaples of how to use EasyData for reproducible datasets and some ways to use notebooks reproducibly +* [Talks and Tutorials based on EasyData](https://github.com/hackalog/easydata/wiki/EasyData-Talks-and-Tutorials) +* [Catalog of EasyData Documentation](https://github.com/hackalog/easydata/wiki/Catalog-of-EasyData-Documentation) +* [The EasyData wiki](https://github.com/hackalog/easydata/wiki) Check here for further troubleshooting and how-to guides for particular problems that aren't in the `references/easydata` docs (including a `git` tutorial) ### The resulting directory structure ------------ diff --git a/cookiecutter.json b/cookiecutter.json index d411e76..cf3153e 100644 --- a/cookiecutter.json +++ b/cookiecutter.json @@ -1,12 +1,12 @@ { "project_name": "project_name", "repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}", - "default_branch": ["master", "main"], + "default_branch": ["main", "master"], "module_name": "src", - "author_name": "Your name (or your organization/company/team)", + "author_name": "Your name (or the copyright holder)", "description": "A short description of this project.", "open_source_license": ["MIT", "BSD-2-Clause", "Proprietary"], - "python_version": ["3.7", "3.6", "latest", "3.8"], + "python_version": ["latest", "3.11", "3.10", "3.9", "3.8", "3.7"], "conda_path": "~/anaconda3/bin/conda", "upstream_location": ["github.com", "gitlab.com", "bitbucket.org", "your-custom-repo"] } diff --git a/docs/00-xyz-sample-notebook.ipynb b/docs/00-xyz-sample-notebook.ipynb index a089002..cc90381 100644 --- a/docs/00-xyz-sample-notebook.ipynb +++ b/docs/00-xyz-sample-notebook.ipynb @@ -150,7 +150,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/Add-csv-template.ipynb b/docs/Add-csv-template.ipynb index ad69434..ad1e37d 100644 --- 
a/docs/Add-csv-template.ipynb +++ b/docs/Add-csv-template.ipynb @@ -83,7 +83,7 @@ "* `csv_path`: The desired path to your .csv file (in this case `epidemiology.csv`) relative to paths['raw_data_path']\n", "* `download_message`: The message to display to indicate to the user how to manually download your .csv file.\n", "* `license_str`: Information on the license for the dataset\n", - "* `descr_str`: Information on the dataset itself" + "* `readme_str`: Information on the dataset itself" ] }, { @@ -123,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "descr_str = \"\"\"\n", + "readme_str = \"\"\"\n", "The epidemiology table from Google's [COVID-19 Open-Data dataset](https://github.com/GoogleCloudPlatform/covid-19-open-data). \n", "\n", "The full dataset contains datasets of daily time-series data related to COVID-19 for over 20,000 distinct locations around the world. The data is at the spatial resolution of states/provinces for most regions and at county/municipality resolution for many countries such as Argentina, Brazil, Chile, Colombia, Czech Republic, Mexico, Netherlands, Peru, United Kingdom, and USA. All regions are assigned a unique location key, which resolves discrepancies between ISO / NUTS / FIPS codes, etc. The different aggregation levels are:\n", @@ -170,7 +170,7 @@ " csv_path=csv_path,\n", " download_message=download_message,\n", " license_str=license_str,\n", - " descr_str=descr_str,\n", + " readme_str=readme_str,\n", " overwrite_catalog=True)" ] }, @@ -206,9 +206,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.EXTRA`.\n", + "By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.FILESET`.\n", "\n", - "The `.EXTRA` functionality is covered in other documentation." + "The `.FILESET` functionality is covered in other documentation." ] }, { @@ -236,7 +236,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds_raw.EXTRA" + "ds_raw.FILESET" ] }, { @@ -246,7 +246,7 @@ "outputs": [], "source": [ "# fq path to epidemiology.csv file\n", - "ds_raw.extra_file('epidemiology.csv')" + "ds_raw.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/Add-derived-dataset.ipynb b/docs/Add-derived-dataset.ipynb index e639190..d5e93e4 100644 --- a/docs/Add-derived-dataset.ipynb +++ b/docs/Add-derived-dataset.ipynb @@ -85,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { @@ -219,7 +219,7 @@ " source_dataset_name\n", " dataset_name\n", " data_function\n", - " added_descr_txt\n", + " added_readme_txt\n", "\n", "We'll want our `data_function` to be defined in the project module (in this case `src`) for reproducibility reasons (which we've already done with `subselect_by_key` above)." 
] @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "added_descr_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", + "added_readme_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n", "to the {key} dataset.\"\"\"" ] }, @@ -281,7 +281,7 @@ " source_dataset_name=source_dataset_name,\n", " dataset_name=dataset_name,\n", " data_function=data_function,\n", - " added_descr_txt=added_descr_txt,\n", + " added_readme_txt=added_readme_txt,\n", " overwrite_catalog=True)" ] }, @@ -318,7 +318,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/New-Dataset-Template.ipynb b/docs/New-Dataset-Template.ipynb index bcf7826..abb8e88 100644 --- a/docs/New-Dataset-Template.ipynb +++ b/docs/New-Dataset-Template.ipynb @@ -167,7 +167,7 @@ "metadata": {}, "source": [ "### Create a process function\n", - "By default, we recommend that you use the `process_extra_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." + "By default, we recommend that you use the `process_fileset_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own." ] }, { @@ -176,11 +176,11 @@ "metadata": {}, "outputs": [], "source": [ - "from src.data.extra import process_extra_files\n", - "process_function = process_extra_files\n", + "from src.data.fileset import process_fileset_files\n", + "process_function = process_fileset_files\n", "process_function_kwargs = {'file_glob':'*.csv',\n", " 'do_copy': True,\n", - " 'extra_dir': ds_name+'.extra',\n", + " 'fileset_dir': ds_name+'.fileset',\n", " 'extract_dir': ds_name}" ] }, @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.EXTRA" + "ds.FILESET" ] }, { @@ -364,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.extra_file('epidemiology.csv')" + "ds.fileset_file('epidemiology.csv')" ] }, { diff --git a/docs/New-Edge-Template.ipynb b/docs/New-Edge-Template.ipynb index 6a1c5bb..3b1058e 100644 --- a/docs/New-Edge-Template.ipynb +++ b/docs/New-Edge-Template.ipynb @@ -88,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "source_ds.EXTRA" + "source_ds.FILESET" ] }, { @@ -178,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(ds.DESCR)" + "print(ds.README)" ] }, { diff --git a/docs/test_docs.py b/docs/test_docs.py index 045cc56..7e8d17a 100644 --- a/docs/test_docs.py +++ b/docs/test_docs.py @@ -9,6 +9,8 @@ import requests from src import paths +from src.log import logger + CCDS_ROOT = Path(__file__).parents[1].resolve() DOCS_DIR = CCDS_ROOT / "docs" @@ -35,6 +37,7 @@ def test_notebook_csv(self): csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv" csv_dest = paths['raw_data_path'] / "epidemiology.csv" if not csv_dest.exists(): + logger.debug("Downloading epidemiology.csv") csv_file = requests.get(csv_url) with open(csv_dest, 'wb') as f: f.write(csv_file.content) diff --git a/{{ cookiecutter.repo_name }}/.circleci/config.yml b/{{ cookiecutter.repo_name }}/.circleci/config.yml index 86db8c0..98373ef 100644 --- a/{{ cookiecutter.repo_name }}/.circleci/config.yml +++ b/{{ cookiecutter.repo_name }}/.circleci/config.yml @@ -8,7 +8,8 @@ jobs: docker: # specify the version you desire here # use `-browsers` prefix for selenium tests, e.g. 
`3.6.1-browsers` - - image: circleci/python:3.7.0 + - image: continuumio/miniconda3 + # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images @@ -20,14 +21,6 @@ jobs: steps: - checkout - - run: - name: Set up Anaconda - command: | - wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh; - chmod +x ~/miniconda.sh; - ~/miniconda.sh -b -p ~/miniconda; - echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV; - - run: name: Create environment and contrive to always use it command: | diff --git a/{{ cookiecutter.repo_name }}/Makefile b/{{ cookiecutter.repo_name }}/Makefile index addf322..15ba76e 100644 --- a/{{ cookiecutter.repo_name }}/Makefile +++ b/{{ cookiecutter.repo_name }}/Makefile @@ -75,17 +75,12 @@ test: update_environment $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -## Run all Unit Tests with coverage +## Run all Unit and code coverage tests test_with_coverage: update_environment $(SET) LOGLEVEL=DEBUG; coverage run -m pytest --pyargs --doctest-modules --doctest-continue-on-failure --verbose \ $(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \ $(MODULE_NAME) -.PHONY: lint -## Lint using flake8 -lint: - flake8 $(MODULE_NAME) - .phony: help_update_easydata help_update_easydata: @$(PYTHON_INTERPRETER) scripts/help-update.py @@ -105,7 +100,7 @@ debug: # Self Documenting Commands # ################################################################################# -HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM +HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM SHELL .DEFAULT_GOAL := show-help .PHONY: show-help diff --git a/{{ cookiecutter.repo_name }}/Makefile.envs b/{{ cookiecutter.repo_name }}/Makefile.envs index 4c65eb7..43396df 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.envs +++ b/{{ cookiecutter.repo_name }}/Makefile.envs @@ -4,28 +4,20 @@ include Makefile.include -$(LOCKFILE): check_installation .make.bootstrap .make.pip-requirements.txt .make.environment-default.yml .make.conda-forge-requirements.txt +$(LOCKFILE): check_installation .make.bootstrap split_environment_files ifeq (conda, $(VIRTUALENV)) - $(CONDA_EXE) env update -n $(PROJECT_NAME) -f .make.environment-default.yml --prune - $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.conda-forge-requirements.txt --channel defaults --channel conda-forge --strict-channel-priority --yes + $(foreach channel, $(shell $(CAT) .make.channel-order.include),\ + $(CONDA_EXE) install -n $(PROJECT_NAME) --file .make.$(channel)-environment.txt --channel defaults --channel $(channel) --strict-channel-priority --yes $(CMDSEP)) $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture pip install -r .make.pip-requirements.txt $(CONDA_EXE) env export -n $(PROJECT_NAME) -f $(LOCKFILE) else $(error Unsupported Environment `$(VIRTUALENV)`. 
Use conda) endif -# extract multi-phase dependencies from environment.yml -.make.environment-pip.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip-yaml $(PROJECT_DIR)environment.yml > $@ - -.make.pip-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py pip $(PROJECT_DIR)environment.yml > $@ - -.make.conda-forge-requirements.txt: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py conda-forge $(PROJECT_DIR)environment.yml > $@ - -.make.environment-default.yml: environment.yml .make.bootstrap - $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py default $(PROJECT_DIR)environment.yml > $@ +.PHONY: split_environment_files +# extract multi-phase dependencies from environment.yml and create ordering file +split_environment_files: environment.yml .make.bootstrap + $(CONDA_EXE) run -n $(PROJECT_NAME) --no-capture $(PYTHON_INTERPRETER) scripts/split_pip.py $(PROJECT_DIR)environment.yml .make.bootstrap: scripts/bootstrap.yml $(CONDA_EXE) env update -n $(PROJECT_NAME) -f scripts/bootstrap.yml @@ -69,6 +61,7 @@ endif # Checks that the conda environment is active environment_enabled: ifeq (conda,$(VIRTUALENV)) + $(CONDA_EXE) config --env --set channel_priority strict ifneq ($(notdir ${CONDA_DEFAULT_ENV}), $(PROJECT_NAME)) $(error Run "$(VIRTUALENV) activate $(PROJECT_NAME)" before proceeding...) endif diff --git a/{{ cookiecutter.repo_name }}/Makefile.include b/{{ cookiecutter.repo_name }}/Makefile.include index e8486ca..85854ee 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.include +++ b/{{ cookiecutter.repo_name }}/Makefile.include @@ -19,5 +19,4 @@ CAT ?= cat SET ?= export WHICH ?= which DEVNULL ?= /dev/null - -$(warning From here on, using SHELL = $(SHELL)) +CMDSEP ?= ; diff --git a/{{ cookiecutter.repo_name }}/Makefile.win32 b/{{ cookiecutter.repo_name }}/Makefile.win32 index 92d8800..de046eb 100644 --- a/{{ cookiecutter.repo_name }}/Makefile.win32 +++ b/{{ cookiecutter.repo_name }}/Makefile.win32 @@ -5,6 +5,7 @@ CAT = type SET = set WHICH = where DEVNULL = nul +CMDSEP = & # Some UNIXish packages force the installation of a Bourne-compatible shell, and Make # prefers using this when it sees it. We thus force the usage of the good ole Batch diff --git a/{{ cookiecutter.repo_name }}/environment.yml b/{{ cookiecutter.repo_name }}/environment.yml index 6749871..5982a14 100644 --- a/{{ cookiecutter.repo_name }}/environment.yml +++ b/{{ cookiecutter.repo_name }}/environment.yml @@ -1,6 +1,6 @@ {% macro pyver() -%} {% if cookiecutter.python_version == 'latest' -%} - - python=3 + - python {% else -%} - python={{ cookiecutter.python_version }} {% endif -%} diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md index e698b52..60a9a9f 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/conda-environments.md @@ -4,13 +4,19 @@ The `{{ cookiecutter.repo_name }}` repo is set up with template code to make man If you haven't yet, configure your conda environment. 
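The `Makefile.envs` change above replaces the hard-coded conda-forge pass with a loop over the channels listed in `.make.channel-order.include`, which is written by `scripts/split_pip.py`. As a rough, hypothetical illustration only (this Python is not part of the repo), the new `$(LOCKFILE)` rule behaves roughly like:
```python
# Illustrative sketch only: the per-channel install loop from the new Makefile.envs rule,
# expressed in Python. Assumes scripts/split_pip.py has already written the .make.* files
# and that conda is on PATH; PROJECT_NAME is a placeholder for $(PROJECT_NAME).
import subprocess

PROJECT_NAME = "my_project"   # placeholder
CONDA_EXE = "conda"           # the Makefile uses $(CONDA_EXE)

with open(".make.channel-order.include") as f:
    channels = f.read().split()          # e.g. ["defaults", "conda-forge"]

for channel in channels:
    subprocess.run(
        [CONDA_EXE, "install", "-n", PROJECT_NAME,
         "--file", f".make.{channel}-environment.txt",
         "--channel", "defaults", "--channel", channel,
         "--strict-channel-priority", "--yes"],
        check=True,
    )

# pip requirements are installed last, inside the target environment
subprocess.run(
    [CONDA_EXE, "run", "-n", PROJECT_NAME, "--no-capture",
     "pip", "install", "-r", ".make.pip-requirements.txt"],
    check=True,
)
```
Each channel gets its own `conda install` pass, in the order given by the `channel-order` section of `environment.yml`, before the lock file is exported.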
+**WARNING**: If you have conda-forge listed as a channel in your `.condarc` (or any other channels other than defaults), you may experience great difficulty generating reproducible conda environments. + +We recommend you remove conda-forge (and all other non-default channels) from your `.condarc` file and [set your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html). You can still use conda-forge (or any other conda channel), just specify it explicitly in your `environment.yml` by prefixing your package name with `channel-name::`; e.g. +``` + - wheel # install from the default (anaconda) channel + - pytorch::pytorch # install this from the `pytorch` channel + - conda-forge::tokenizers # install this from conda-forge +``` + ## Configuring your python environment Easydata uses conda to manage python packages installed by both conda **and pip**. ### Adjust your `.condarc` -**WARNING FOR EXISTING CONDA USERS**: If you have `conda-forge` listed as a channel in your `.condarc` (or any other channels other than `default`), **remove them**. These channels should be specified in `environment.yml` instead. - -We also recommend [setting your channel priority to 'strict'](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html) to reduce package incompatibility problems. This will be the default in conda 5.0, but in order to assure reproducibility, we need to use this behavior now. ``` conda config --set channel_priority strict @@ -26,18 +32,30 @@ conda config --prepend channels defaults conda config --prepend envs_dirs ~/.conda/envs # Store environments in local dir for JupyterHub ``` -### Fix the CONDA_EXE path -* Make note of the path to your conda binary: +#### Locating the `conda` binary +Ensure the Makefile can find your conda binary, either by setting the `CONDA_EXE` environment variable, or by modifying `Makefile.include` directly. + +First, check if `CONDA_EXE` is already set +``` + >>> export | grep CONDA_EXE + CONDA_EXE=/Users/your_username/miniconda3/bin/conda +``` + +If `CONDA_EXE` is not set, you will need to set it manually in `Makefile.include`; i.e. + +* Make note of the path to your conda binary. It should be in the `bin` subdirectory of your Anaconda (or miniconda) installation directory: ``` - $ which conda + >>> which conda # this will only work if conda is in your PATH, otherwise, verify manually ~/miniconda3/bin/conda ``` -* ensure your `CONDA_EXE` environment variable is set correctly in `Makefile.include` +* ensure your `CONDA_EXE` environment variable is set to this value; i.e. ``` - export CONDA_EXE=~/miniconda3/bin/conda + >>> export CONDA_EXE=~/miniconda3/bin/conda ``` +or edit `Makefile.include` directly. + ### Create the conda environment -* Create and switch to the virtual environment: +Create and switch to the virtual environment: ``` cd {{ cookiecutter.repo_name }} make create_environment @@ -63,6 +81,7 @@ When adding packages to your python environment, **do not `pip install` or `cond Your `environment.yml` file will look something like this: ``` name: {{ cookiecutter.repo_name }} +dependencies: - pip - pip: - -e . # conda >= 4.4 only @@ -88,7 +107,7 @@ name: {{ cookiecutter.repo_name }} ``` To add any package available from conda, add it to the end of the list. If you have a PYPI dependency that's not avaible via conda, add it to the list of pip installable dependencies under ` - pip:`. 
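These edits can also be scripted. The sketch below is not code from the repo; it uses the same `yaml.safe_load`/`yaml.safe_dump` round-trip as the new `scripts/tests/add-extra-channel-dependency.py` helper that appears later in this diff, and the package names are purely hypothetical:
```python
# Hypothetical sketch: programmatically adding dependencies to environment.yml,
# following the same pattern as scripts/tests/add-extra-channel-dependency.py.
import yaml

with open("environment.yml", "rt", encoding="utf-8") as f:
    env = yaml.safe_load(f)

env["dependencies"].append("some-conda-package")        # hypothetical conda package
for dep in env["dependencies"]:
    if isinstance(dep, dict) and "pip" in dep:
        dep["pip"].append("some-pip-only-package")       # hypothetical pip dependency

with open("environment.yml", "wt", encoding="utf-8") as f:
    yaml.safe_dump(env, f)
```
Note that `yaml.safe_dump` does not preserve comments or key order, so for routine changes editing `environment.yml` by hand and running `make update_environment` is usually simpler.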
-You can include any {{ cookiecutter.upstream_location }} python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. +You can include any `{{ cookiecutter.upstream_location }}` python-based project in the `pip` section via `git+https://{{ cookiecutter.upstream_location }}//`. In particular, if you're working off of a fork or a work in progress branch of a repo in {{ cookiecutter.upstream_location }} (say, your personal version of ), you can change `git+https://{{ cookiecutter.upstream_location }}//` to @@ -99,6 +118,43 @@ Once you're done your edits, run `make update_environment` and voila, you're upd To share your updated environment, check in your `environment.yml` file. (More on this in [Sharing your Work](sharing-your-work.md)) +#### Adding packages from other conda channels +Say we want to add a package only available from the `conda-forge` conda channel and not the default conda channel. (The conda channel is what follows `-c` when using `conda install -c my-channel my-package`. Suppose we want to use `make` on windows. Then we need to use `conda-forge` since the default conda channel only has linux and macOS installations of `make`. To normally conda install this, we would use `conda install -c conda-forge make`. **We won't do that here**. + +Instead, we add a `channel-order` section that starts with `defaults` and lists the other channels we want to use in the order we want to install from them (note that this is a custom EasyData section to the `environment.yml`). Then we add our package in the dependency list in the form `channel-name::package-name`, for example, `conda-forge::make`. + +In this case an updated `environment.yml` file looks like this: +``` +name: {{ cookiecutter.repo_name }} +channel-order: + - defaults + - conda-forge +dependencies: + - pip + - pip: + - -e . # conda >= 4.4 only + - python-dotenv>=0.5.1 + - nbval + - nbdime + - umap-learn + - gdown + - setuptools + - wheel + - git>=2.5 # for git worktree template updating + - sphinx + - bokeh + - click + - colorcet + - coverage + - coveralls + - datashader + - holoviews + - matplotlib + - jupyter + - conda-forge::make +... +``` + #### Lock files Now, we'll admit that this workflow isn't perfectly reproducible in the sense that conda still has to resolve versions from the `environment.yml`. To make it more reproducible, running either `make create_environment` or `make update_environment` will generate an `environment.{$ARCH}.lock.yml` (e.g. `environment.i386.lock.yml`). This file keeps a record of the exact environment that is currently installed in your conda environment `{{ cookiecutter.repo_name }}`. If you ever need to reproduce an environment exactly, you can install from the `.lock.yml` file. (Note: These are architecture dependent). 
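To make the `channel-order` mechanism concrete, here is a simplified, hedged sketch of what the rewritten `scripts/split_pip.py` does with such a file. The `.make.*` file names are the real ones consumed by `Makefile.envs`; the logic here is abridged:
```python
# Abridged sketch of the splitting done by scripts/split_pip.py (simplified; the real
# script also validates that every channel-prefixed dependency appears in channel-order).
from collections import defaultdict
import yaml

with open("environment.yml") as f:
    env = yaml.safe_load(f)

channel_order = env.get("channel-order") or ["defaults"]
if "defaults" not in channel_order:
    channel_order.insert(0, "defaults")

channel_dict = defaultdict(list)
for dep in env["dependencies"]:
    if isinstance(dep, dict) and dep.get("pip"):
        channel_dict["pip"] = dep["pip"]              # the nested pip requirement list
    elif isinstance(dep, str) and "::" in dep:
        channel, pkg = dep.split("::", 1)             # e.g. "conda-forge::make"
        channel_dict[channel].append(pkg)
    else:
        channel_dict["defaults"].append(dep)

with open(".make.channel-order.include", "w") as f:
    f.write(" ".join(channel_order))                  # conda channels, in install order

for channel in channel_order + ["pip"]:
    filename = (".make.pip-requirements.txt" if channel == "pip"
                else f".make.{channel}-environment.txt")
    with open(filename, "w") as f:
        f.write("\n".join(channel_dict[channel]))
```
`Makefile.envs` then runs one `conda install` per listed channel (as sketched earlier) and installs the pip requirements last.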
diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md index 5b54c1e..ff5923c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/datasets.md @@ -3,8 +3,8 @@ ## TL;DR To get started, all you really need to know is that you can query for available datasets via ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog() +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets") ``` and load these datasets via @@ -15,15 +15,18 @@ ds = Dataset.load(dataset_name) If you've followed the instructions from building the repo contained in the [README](../README.md), this should just work (if it doesn't, please let us know)! -You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.DESCR` and `ds.LICENSE`. +You can start using the data via `ds.data`. To find out more about the dataset you've just loaded, take a look at `ds.README` and `ds.LICENSE`. -**Warning**: some of the datasets can be quite large. If you want to store your data externally, we recommend symlinking your data directory (that is the `{{ cookiecutter.repo_name }}/data` directory) to somewhere with more room before loading your first `Dataset`. +**Disk Space Note**: sometimes datasets can be quite large. If you want to store your data externally, we recommend pointing your data directory to a new location; that is, +```python +from {{ cookiecutter.module_name }} import paths +paths["data_path"] = "/path/to/big/data/directory" +``` ## Digging Deeper It is useful to know a little bit more about how Datasets work. - ## What is a `Dataset` object? A Dataset is the fundamental object we use for turning raw data into useful datasets, reproducibly. It is like a scikit-learn-style `Bunch` object --- essentially, a dictionary with some extra magic to make it nicer to work with --- containing the following attributes: @@ -36,7 +39,7 @@ A Dataset is the fundamental object we use for turning raw data into useful data The `data` attribute can really be any processed data form that you like: sometimes it's a pandas dataframe (like with `wine_reviews_130k`), a list of tuples containing other data, (`reddit_comment_tree_graphs`), or other formats including `scipy.sparse` matrices or `igraph` graphs. The `target` (if you're using it), expects something that matches the `data` in terms of length. -For a hint as to which data format to expect, you can look at the contents of the `DESCR` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. +For a hint as to which data format to expect, you can look at the contents of the `README` attribute, one of the many pieces of medata that are maintained as part of the `Dataset` object. This `metadata` is where things get interesting... which we'll cover on its own next. @@ -44,9 +47,9 @@ This `metadata` is where things get interesting... which we'll cover on its own The `metadata` is where the magic lives. 
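Putting the renamed pieces together (`Catalog`, `Dataset.load`, and the `README`/`LICENSE`/`FILESET` attributes that replace `DESCR`/`EXTRA`), here is a hedged usage sketch. The module name assumes the default `src`, and the dataset names match the ones built in the docs notebooks; substitute whatever your catalog actually contains:
```python
# Hedged usage sketch; dataset names are the ones used in the docs notebooks and may not
# exist in your repo. Module name assumes the cookiecutter default `src`.
from src import paths
from src.data import Catalog, Dataset

paths["data_path"] = "/path/to/big/data/directory"   # optional: relocate data storage

Catalog.load("datasets")                     # browse the available dataset recipes

ds = Dataset.load("covid-19-epidemiology")
print(ds.README)                             # what the data is (formerly ds.DESCR)
print(ds.LICENSE)                            # usage restrictions (unchanged)
ds.data                                      # processed data, often a pandas DataFrame

ds_raw = Dataset.load("covid-19-epidemiology_raw")
ds_raw.FILESET                               # raw-file manifest (formerly ds.EXTRA)
ds_raw.fileset_file("epidemiology.csv")      # fully qualified path to a fileset file
```
Everything beyond `ds.data` here is drawn from the dataset's `metadata`.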
It serves several purposes in terms of bookkeeping: * it includes `HASHES`, which **improve data reproducibility**, since what you download and process gets checked each step along the way to ensure the raw data matches what is stored in the `dataset_catalog`, -* it provides easy access to **what the data is** via the `DESCR` attribute, +* it provides easy access to **what the data is** via the `README` attribute, * it provides easy (and continual) **access to the license / usage restrictions** for the data (the `LICENSE` attribute), which helps with knowing what you can do when [Sharing your Work](sharing-your-work.md). -* it provides the **extra data manifest**, `EXTRA`, if your dataset includes around additional raw data (extra) files. +* it provides the **fileset data manifest**, `FILESET`, if your dataset includes around additional raw data (fileset) files. In short, it helps you to know what data you're working with, what you can do with it, and whether something has gone wrong. @@ -73,21 +76,19 @@ ds.metadata To access the most common metadata fields: ```python -ds.DESCR # or ds.metadata['descr'] +ds.README # or ds.metadata['readme'] ds.LICENSE # or ds.metadata['license'] ds.HASHES # or ds.metadata['hashes'] ``` ## The catalog -While we do our best to keep the documentation in [Available Datasets](docs/available-datasets.md) up-to-date with what's in the code, you can explore all of the currently available `Datasets` via the `dataset_catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. +You can explore all of the currently available `Datasets` via the Dataset `Catalog`. The catalog keeps a record of the recipes used to generate a `Dataset` along with relevant hashes that are used to ensure the integrity of data when it's loaded. To access the catalog: ```python -from {{ cookiecutter.module_name }} import workflow -workflow.dataset_catalog(keys_only=True) +from {{ cookiecutter.module_name }}.data import Catalog +Catalog.load("datasets") ``` -If you're interested, set `keys_only=False` to see the complete contents of the metadata that is saved in the catalog. - ## Sharing your Data as a `Dataset` object In order to convert your data to a `Dataset` object, you will need to generate a catalog *recipe*, that uses a custom *function for processing your raw data*. Doing so allows us to document all the munging, pre-processing, and data verification necessary to reproducibly build the dataset. diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md index ce9e87d..50d5179 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/git-workflow.md @@ -1,5 +1,5 @@ -# The Easydata Git Workflow -Here's our suggestion for a reliable git workflow that works well in small team settings using [Easydata][cookiecutter-easydata]. +# The EasyData Git Workflow +Here's our suggestion for a reliable git workflow that works well in **small team settings**; e.g. when using [Easydata][easydata] in a group setting. ## Git configuration @@ -49,7 +49,7 @@ git merge {{cookiecutter.default_branch}} git push origin my_branch ``` -### Do I have any stale branches? 
+### Clean up the junk With your local `{{cookiecutter.default_branch}}`, `origin/{{cookiecutter.default_branch}}` and `upstream/{{cookiecutter.default_branch}}` all in sync, we like to clean up any old branches that are fully merged (and hence, can be deleted without data loss.) ```bash git branch --merged {{cookiecutter.default_branch}} @@ -58,15 +58,15 @@ git branch -d A really great feature of `git branch -d` is that it will refuse to remove a branch that hasn't been fully merged into another. Thus it's safe to use without any fear of data loss. -### Time to start the day +### Start the day Once you've finished all your merge tasks, you can create a clean working branch from the latest `{{cookiecutter.default_branch}}` by doing a: ```bash git checkout {{cookiecutter.default_branch}} git checkout -b new_branch_name ``` +That's it! Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue in our +[easydata issue tracker]. -That's it!. Do you have any suggestions for improvements to this workflow? Drop us a line or file an issue at -[cookiecutter-easydata]. - -[cookiecutter-easydata]: https://github.com/hackalog/cookiecutter-easydata/ \ No newline at end of file +[easydata issue tracker]: https://github.com/hackalog/easydata/issues +[easydata]: https://github.com/hackalog/easydata \ No newline at end of file diff --git a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md index f975369..270775c 100644 --- a/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md +++ b/{{ cookiecutter.repo_name }}/reference/easydata/notebooks.md @@ -79,8 +79,7 @@ output_notebook(resources=INLINE) # Source module imports from {{ cookiecutter.module_name }} import paths -from {{ cookiecutter.module_name }}.data import DataSource, Dataset -from {{ cookiecutter.module_name }} import workflow +from {{ cookiecutter.module_name }}.data import DataSource, Dataset, Catalog ``` You can also find most of these header cells in [00-xyz-sample-notebook.ipynb](../notebooks/00-xyz-sample-notebook.ipynb) @@ -97,8 +96,9 @@ There is a whole world of cell magics. 
These are bits of code that you can put a ### Quick References * [README](../README.md) -* [Setting up and Maintaining your Conda Environment Reproducibly](conda-environments.md) +* [Setting up and Maintaining your Conda Environment, Reproducibly](conda-environments.md) * [Getting and Using Datasets](datasets.md) +* [Specifying Paths in Easydata](paths.md) * [Using Notebooks for Analysis](notebooks.md) * [Sharing your Work](sharing-your-work.md) * [Troubleshooting Guide](troubleshooting.md) diff --git a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml index c52f026..d0e5cc0 100644 --- a/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml +++ b/{{ cookiecutter.repo_name }}/scripts/bootstrap.yml @@ -1,5 +1,13 @@ +{% macro pyver() -%} +{% if cookiecutter.python_version == 'latest' -%} + - python +{% else -%} + - python={{ cookiecutter.python_version }} +{% endif -%} +{% endmacro -%} +name: {{ cookiecutter.repo_name }} channels: - - defaults + - defaults dependencies: - - python=3.7 - - pyyaml + - pyyaml +{{ pyver()|indent(3, true) }} diff --git a/{{ cookiecutter.repo_name }}/scripts/split_pip.py b/{{ cookiecutter.repo_name }}/scripts/split_pip.py index ecdc987..62d059c 100644 --- a/{{ cookiecutter.repo_name }}/scripts/split_pip.py +++ b/{{ cookiecutter.repo_name }}/scripts/split_pip.py @@ -2,13 +2,19 @@ import json import sys import yaml +from collections import defaultdict -ACCEPTABLE_FORMATS = ["default", "pip", "pip-yaml", "conda-forge"] -def env_split(conda_env, kind="default"): - """Given a conda_environment dict, split into pip/nonpip versions +def env_split(conda_env, channel_order): + """Given a conda_environment dict, and a channel order, split into versions for each channel. + + Returns: + + conda_env: (list) + remaining setup bits of the environment.yml file + channel_dict: (dict) + dict containing the list of dependencies by channel name - conda_env: dict Python object corresponding to environment.yml""" # Cheater way to make deep Copies json_copy = json.dumps(conda_env) @@ -17,49 +23,63 @@ def env_split(conda_env, kind="default"): pipdeps = None deplist = conda_env.pop('dependencies') - conda_forge_list = [] + channel_dict = defaultdict(list) for k, dep in enumerate(deplist[:]): # Note: copy list, as we mutate it if isinstance(dep, dict): # nested yaml if dep.get('pip', None): - pipdeps = ["pip", deplist.pop(k)] + channel_dict['pip'] = deplist.pop(k) else: - prefix = 'conda-forge::' - if dep.startswith(prefix): - conda_forge_list.append(dep[len(prefix):]) + prefix_check = dep.split('::') + if len(prefix_check) > 1: + channel = prefix_check[0] + if not channel in channel_order: + raise Exception(f'the channel {channel} required for {dep} is not specified in a channel-order section of the environment file') + channel_dict[f'{channel}'].append(prefix_check[1]) deplist.remove(dep) - conda_env['dependencies'] = deplist - pip_env['dependencies'] = pipdeps - return conda_env, pip_env, conda_forge_list + channel_dict['defaults'] = deplist + conda_env.pop('channel-order', None) + return conda_env, channel_dict + +def get_channel_order(conda_env): + """ + Given a conda_environment dict, get the channels from the channel order. 
+ """ + channel_order = conda_env.get('channel-order') + + if channel_order is None: + channel_order = ['defaults'] + if not 'defaults' in channel_order: + channel_order.insert(0, 'defaults') + channel_order.append('pip') + return channel_order def usage(): print(f""" -Usage: split_pip.py [{"|".join(ACCEPTABLE_FORMATS)}] path/to/environment.yml +Usage: split_pip.py path/to/environment.yml """) if __name__ == '__main__': - if len(sys.argv) != 3: - usage() - exit(1) - - kind = sys.argv[1] - if kind not in ACCEPTABLE_FORMATS: + if len(sys.argv) != 2: usage() exit(1) - with open(sys.argv[2], 'r') as yamlfile: + with open(sys.argv[1], 'r') as yamlfile: conda_env = yaml.safe_load(yamlfile) - cenv, penv, forgelist = env_split(conda_env) - if kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "pip": - print("\n".join(penv["dependencies"].pop(-1)["pip"])) - elif kind == "pip-yaml": - _ = yaml.dump(penv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "default": - _ = yaml.dump(cenv, sys.stdout, allow_unicode=True, default_flow_style=False) - elif kind == "conda-forge": - print("\n".join(forgelist)) - else: - raise Exception(f"Invalid Kind: {kind}") + #check for acceptable formats + channel_order = get_channel_order(conda_env) + with open('.make.channel-order.include', 'w') as f: + f. write(' '.join(channel_order[:-1])) #exclude pip as a channel here + + cenv, channel_dict = env_split(conda_env, channel_order) + + for kind in channel_order: + if kind == "pip": + filename = '.make.pip-requirements.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict['pip']['pip'])) + else: + filename = f'.make.{kind}-environment.txt' + with open(filename, 'w') as f: + f.write("\n".join(channel_dict[kind])) diff --git a/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py new file mode 100644 index 0000000..8c41a6b --- /dev/null +++ b/{{ cookiecutter.repo_name }}/scripts/tests/add-extra-channel-dependency.py @@ -0,0 +1,14 @@ +import sys +import yaml + + +if __name__ == "__main__": + channel_order = ['defaults', 'pytorch'] + dependency_new = "pytorch::cpuonly" + + with open("environment.yml", "rt", encoding="utf-8") as file_env: + env = yaml.safe_load(file_env) + env["dependencies"].append(dependency_new) + env["channel-order"] = channel_order + with open("environment.yml", "wt", encoding="utf-8") as file_env: + yaml.safe_dump(env, file_env) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py index cd3ea61..872135a 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/__init__.py @@ -14,6 +14,7 @@ 'project_path': '${catalog_path}/..', 'raw_data_path': '${data_path}/raw', 'template_path': '${project_path}/reference/templates', + 'abfs_cache': '${interim_data_path}/abfs_cache', } _catalog_file = _module_dir.parent / "catalog" / "config.ini" diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py index 1c23d32..b938c9f 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/_paths.py @@ -1,7 +1,7 @@ from .decorators import 
SingletonDecorator from .kvstore import KVStore from .log import logger -import pathlib +from pathlib import Path class PathStore(KVStore): """Persistent Key-Value store for project-level paths @@ -13,15 +13,16 @@ class PathStore(KVStore): By default, the project directory is the parent of the directory containing the `config_file`: - >>> b['project_path'] - PosixPath('/tmpx/project') - >>> b['data_path'] - PosixPath('/tmpx/project/data') + + >>> b['project_path'] == Path('/tmpx/project').resolve() + True + >>> b['data_path'] == Path('/tmpx/project/data').resolve() + True The `catalog_path` is set upon instantiation and is read-only: - >>> b['catalog_path'] - PosixPath('/tmpx/project/catalog') + >>> b['catalog_path'] == Path('/tmpx/project/catalog').resolve() + True >>> b['catalog_path'] = '/tmp' Traceback (most recent call last): ... @@ -30,21 +31,21 @@ class PathStore(KVStore): Changing a value changes all values that expand to contain it: >>> b['project_path'] = '/tmpy' - >>> b['project_path'] - PosixPath('/tmpy') - >>> b['data_path'] - PosixPath('/tmpy/data') + >>> b['project_path'] == Path('/tmpy').resolve() + True + >>> b['data_path'] == Path('/tmpy/data').resolve() + True We can have multiple levels of expansion: >>> b['raw_data_path'] = "${data_path}/raw" - >>> b['raw_data_path'] - PosixPath('/tmpy/data/raw') + >>> b['raw_data_path'] == Path('/tmpy/data/raw').resolve() + True >>> b['project_path'] = '/tmp3' - >>> b['data_path'] - PosixPath('/tmp3/data') - >>> b['raw_data_path'] - PosixPath('/tmp3/data/raw') + >>> b['data_path'] == Path('/tmp3/data').resolve() + True + >>> b['raw_data_path'] == Path('/tmp3/data/raw').resolve() + True """ # These keys should never be written to disk, though they may be used @@ -58,7 +59,7 @@ def __init__(self, *args, if config_file is None: self._config_file = "config.ini" else: - self._config_file = pathlib.Path(config_file) + self._config_file = Path(config_file) self._usage_warning = False super().__init__(*args, config_section=config_section, config_file=self._config_file, **kwargs) @@ -88,7 +89,7 @@ def __getitem__(self, key): if key in self._protected: return getattr(self, key) self._read() - return pathlib.Path(super().__getitem__(key)).resolve() + return Path(super().__getitem__(key)).resolve() @property def catalog_path(self): diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py index 4e7b43e..81d21fc 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/__init__.py @@ -2,4 +2,4 @@ from .datasets import * from .fetch import * from .utils import * -from .extra import * +from .fileset import * diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py index 2fa411c..7f44b47 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/datasets.py @@ -88,8 +88,10 @@ def __init__(self, catalog_file='datasets.json', **kwargs): """ - Object representing a dataset object. - Notionally compatible with scikit-learn's Bunch object + EasyData Dataset container Object. + + Contains metadata (README, LICENSE), associated file list (FILESET), and + optionally a data object. 
dataset_name: string (required) key to use for this dataset @@ -99,7 +101,7 @@ def __init__(self, Either classification target or label to be used. for each of the points in `data` metadata: dict - Data about the object. Key fields include `license_txt`, `descr`, and `hashes` + Data about the object. Key fields include `license`, `readme`, and `hashes` update_hashes: Boolean If True, recompute the data/target hashes in the Metadata """ @@ -118,7 +120,7 @@ def __init__(self, self['metadata']['dataset_name'] = dataset_name self['data'] = data self['target'] = target - #self['extra'] = Extra.from_dict(metadata.get('extra', None)) + #self['fileset'] = Fileset.from_dict(metadata.get('fileset', None)) data_hashes = self._generate_data_hashes() if update_hashes: @@ -153,10 +155,10 @@ def __setattr__(self, key, value): self['metadata'][key.lower()] = value elif key == 'name': self['metadata']['dataset_name'] = value - elif key in ['extra_base', 'extra_auth_kwargs']: + elif key in ['fileset_base', 'fileset_auth']: if self.name not in paths._config.sections(): paths._config.add_section(self.name) - if key == 'extra_auth_kwargs': + if key == 'fileset_auth': paths._config.set(self.name, key, json.dumps(value, sort_keys=True)) else: paths._config.set(self.name, key, value) @@ -170,7 +172,7 @@ def __delattr__(self, key): del self['metadata'][key.lower()] elif key == 'name': raise ValueError("name is mandatory") - elif key == 'extra_base': + elif key == 'fileset_base': if paths._config.has_section(self.name) and paths._config.has_option(self.name, key): paths._config.remove_option(self.name, key) paths._write() @@ -226,26 +228,67 @@ def resolve_local_config(self, key, default=None, kind="string"): raise ValueError(f"Unknown kind: {kind}") @property - def extra_base(self): - return self.resolve_local_config("extra_base", paths['processed_data_path'] / f"{self.name}.extra") + def fileset_base(self): + return self.resolve_local_config("fileset_base", paths['processed_data_path'] / f"{self.name}.fileset") @property - def extra_auth_kwargs(self): - return self.resolve_local_config("extra_auth_kwargs", "{}", kind="json") + def fileset_auth(self): + return self.resolve_local_config("fileset_auth", "{}", kind="json") + + def filesystem(self): + """Return an fsspec filesystem object associated with this fileset_base. + + If present, the kwargs specified in 'Dataset.fileset_auth' will be used to authenticate the connection. These must be valid + parameters to 'fsspec.open()' + + returns: fsspec.FileSystem object + + """ + f = fsspec.open(self.fileset_base, **self.fileset_auth) + return f.fs + + def fileset(self, dirs_only=False): + """Enumerate contents of fileset. 
+ + Automatically prepends `fileset_base` + + Parameters:: + dirs_only: Boolean + if True, returns only directory names containing files + if False, returns files and their associated hashes + + Useful for file formats that are actually directories, like parquet + + Returns: + if dirs_only is True: + list of directories containing files in the fileset + else + tuples of filenames, hashlists for every file in the fileset + """ + eb = self.fileset_base + sep = "/" + ret = [] + for subdir, filedict in self.FILESET.items(): + if dirs_only: + ret.append(sep.join([eb, subdir])) + else: # returns all files + for f, hashlist in filedict.items(): + ret.append((sep.join([eb, subdir, f]), hashlist)) + return ret # Note: won't work because of set/setattr magic above - #@extra_base.deleter - #def extra_base(self): - # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "extra_base"): - # paths._config.remove_option("extra_base") + #@fileset_base.deleter + #def fileset_base(self): + # if paths._config.has_section(self.name) and paths._config.has_option(self.name, "fileset_base"): + # paths._config.remove_option("fileset_base") # Note: Won't work because of setattr magic above - #@extra_base.setter - #def extra_base(self, val): + #@fileset_base.setter + #def fileset_base(self, val): # if self.name not in paths._config.sections(): # paths._config.add_section(self.name) - # paths._config.set(self.name, "extra_base", val) + # paths._config.set(self.name, "fileset_base", val) # paths._write() # logger.debug(f"Writing {paths._config_file}") @@ -579,22 +622,22 @@ def verify_hashes(self, hashdict=None, catalog_path=None): hashdict = c[self.name]["hashes"] return hashdict.items() <= self.metadata['hashes'].items() - def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, hash_types=['size']): + def verify_fileset(self, fileset_base=None, file_dict=None, return_filelists=False, hash_types=['size']): """ - Verify that all files listed in the metadata EXTRA dict are accessible and have good hashes. + Verify that all files listed in the metadata FILESET dict are accessible and have good hashes. Returns boolean - True if all files are accessible and have good hashes - and optional file lists. Parameters ---------- - extra_base: path or None - base for the EXTRA filenames. + fileset_base: path or None + base for the FILESET filenames. 
if passed as explicit parameter, this location will be used - if omitted, the dataset `extra_base` will be read (which checks the local_config, - or self.EXTRA_BASE, in that order) - file_dict: sub-dict of extra dict - if None, default to the whole extra dict + if omitted, the dataset `fileset_base` will be read (which checks the local_config, + or self.FILESET_BASE, in that order) + file_dict: sub-dict of fileset dict + if None, default to the whole fileset dict return_filelists: boolean, default False if True, returns triple (good_hashes, bad_hashes, missing_files) else, returns Boolean (all files good) @@ -617,19 +660,19 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, files that are inaccessible """ - if extra_base is None: - extra_base = self.extra_base - extra_base = pathlib.Path(extra_base) - extra_dict = self.metadata.get('extra', None) + if fileset_base is None: + fileset_base = self.fileset_base + fileset_base = pathlib.Path(fileset_base) + fileset_dict = self.metadata.get('fileset', None) if file_dict is None: - file_dict = extra_dict + file_dict = fileset_dict else: - if not (file_dict.keys() <= extra_dict.keys()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict.keys() <= fileset_dict.keys()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") else: for key in file_dict.keys(): - if not (file_dict[key].items() <= extra_dict[key].items()): - raise ValueError(f"file_dict must be a subset of the metadata['extra'] dict") + if not (file_dict[key].items() <= fileset_dict[key].items()): + raise ValueError(f"file_dict must be a subset of the metadata['fileset'] dict") retval = False bad_hash = [] @@ -641,7 +684,7 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: for directory in file_dict.keys(): for file, meta_hash_list in file_dict[directory].items(): - path = extra_base / directory / file + path = fileset_base / directory / file rel_path = pathlib.Path(directory) / file if path.exists(): disk_hash_list = [] @@ -660,52 +703,52 @@ def verify_extra(self, extra_base=None, file_dict=None, return_filelists=False, else: return retval - def subselect_extra(self, rel_files): - """Convert a (relative) pathname to an EXTRA dict + def subselect_fileset(self, rel_files): + """Convert a (relative) pathname to an FILESET dict - Suitable for passing to verify_extra() + Suitable for passing to verify_fileset() """ - extra_dict = defaultdict(dict) + fileset_dict = defaultdict(dict) for rel_file_path in rel_files: rel_path = pathlib.Path(rel_file_path) try: - hashlist = self.EXTRA[str(rel_path.parent)][rel_path.name] + hashlist = self.FILESET[str(rel_path.parent)][rel_path.name] except KeyError: - raise NotFoundError(f"Not in EXTRA: {rel_file_path}") from None - extra_dict[str(rel_path.parent)][rel_path.name] = hashlist - return dict(extra_dict) + raise NotFoundError(f"Not in FILESET: {rel_file_path}") from None + fileset_dict[str(rel_path.parent)][rel_path.name] = hashlist + return dict(fileset_dict) - def extra_file(self, relative_path): - """Convert a relative path (relative to extra_base) to a fully qualified location + def fileset_file(self, relative_path): + """Convert a relative path (relative to fileset_base) to a fully qualified location - extra_base may be prefixed with optional protocol like `s3://` and + fileset_base may be prefixed with optional protocol like `s3://` and is suitable for passing to fsspec.open_files() Parameters 
---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. """ - extra_base = self.extra_base - if extra_base.startswith("/"): - fqpath = str(pathlib.Path(extra_base) / relative_path) - elif extra_base.endswith('/'): - fqpath = f"{extra_base}{relative_path}" + fileset_base = self.fileset_base + if fileset_base.startswith("/"): + fqpath = str(pathlib.Path(fileset_base) / relative_path) + elif fileset_base.endswith('/'): + fqpath = f"{fileset_base}{relative_path}" else: - fqpath = f"{extra_base}/{relative_path}" + fqpath = f"{fileset_base}/{relative_path}" return fqpath - def open_extra(self, relative_path, auth_kwargs=None, **kwargs): - """Given a path (relative to extra_base), return an fsspec.OpenFile object + def open_fileset(self, relative_path, auth_kwargs=None, **kwargs): + """Given a path (relative to fileset_base), return an fsspec.OpenFile object Parameters ---------- relative_path: string or list - Relative filepath. Will be appended to extra_base (and an intervening '/' added as needed) - extra_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. + Relative filepath. Will be appended to fileset_base (and an intervening '/' added as needed) + fileset_base can be prefixed with a protocol like `s3://` to read from alternate filesystems. To read from multiple files you can pass a globstring or a list of paths, with the caveat that they must all have the same protocol. auth_kwargs: dict or None @@ -717,7 +760,7 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): Examples -------- - >>> with ds.open_extra('2020-01-*.csv') as f: + >>> with ds.open_fileset('2020-01-*.csv') as f: ... df = pd.read_csv(f) # doctest: +SKIP Returns @@ -726,11 +769,11 @@ def open_extra(self, relative_path, auth_kwargs=None, **kwargs): be used as a single context """ if auth_kwargs is None: - auth_kwargs = self.extra_auth_kwargs + auth_kwargs = self.fileset_auth if auth_kwargs: logger.debug(f"Passing authentication information via auth_kwargs") - return fsspec.open(self.extra_file(relative_path), **auth_kwargs, **kwargs) + return fsspec.open(self.fileset_file(relative_path), **auth_kwargs, **kwargs) def dump(self, file_base=None, dump_path=None, hash_type='sha1', exists_ok=False, create_dirs=True, dump_metadata=True, update_catalog=True, @@ -867,8 +910,8 @@ def __init__(self, Value of hash used to verify file integrity file_name: string (optional) filename to use when saving file locally. If omitted, it will be inferred from url or source_file - name: string or {'DESCR', 'LICENSE'} (optional) - description of the file. of DESCR or LICENSE, will be used as metadata + name: string or {'README', 'LICENSE'} (optional) + description of the file. of README or LICENSE, will be used as metadata unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. @@ -909,14 +952,14 @@ def file_list(self): logger.warning("file_list is deprecated. 
Use file_dict instead") return list(self.file_dict.values()) - def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='DESCR', unpack_action='copy', force=False): + def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='README', unpack_action='copy', force=False): """Add metadata to a DataSource filename: create metadata entry from contents of this file. Relative to `metadata_path` contents: create metadata entry from this string metadata_path: (default `paths['raw_data_path']`) where to store metadata files - kind: {'DESCR', 'LICENSE'} + kind: {'README', 'LICENSE'} unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. force: boolean (default False) @@ -928,7 +971,7 @@ def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='D metadata_path = pathlib.Path(metadata_path) filename_map = { - 'DESCR': f'{self.name}.readme', + 'README': f'{self.name}.readme', 'LICENSE': f'{self.name}.license', } if kind not in filename_map: @@ -1337,7 +1380,7 @@ def process(self, return_X_y: boolean if True, returns (data, target) instead of a `Dataset` object. use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. """ if not self.unpacked_: logger.debug("process() called before unpack()") @@ -1373,13 +1416,13 @@ def process(self, def default_metadata(self, use_docstring=False): """Returns default metadata derived from this DataSource - This sets the dataset_name, and fills in `license` and `descr` + This sets the dataset_name, and fills in `license` and `readme` fields if they are present, either on disk, or in the file list Parameters ---------- use_docstring: boolean - If True, the docstring of `self.process_function` is used as the Dataset DESCR text. + If True, the docstring of `self.process_function` is used as the Dataset README text. Returns ------- @@ -1388,12 +1431,12 @@ def default_metadata(self, use_docstring=False): metadata = {} optmap = { - 'DESCR': 'descr', + 'README': 'readme', 'LICENSE': 'license', } filemap = { 'license': f'{self.name}.license', - 'descr': f'{self.name}.readme' + 'readme': f'{self.name}.readme' } for key, fetch_dict in self.file_dict.items(): @@ -1406,7 +1449,7 @@ def default_metadata(self, use_docstring=False): if use_docstring: func = partial(self.process_function) fqfunc, invocation = partial_call_signature(func) - metadata['descr'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ + metadata['readme'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ f'{invocation}\n\n>>> help({func.func.__name__})\n\n' + \ f'{func.func.__doc__}' diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py similarity index 59% rename from {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py rename to {{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py index 74419cb..f69aced 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/extra.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/fileset.py @@ -1,5 +1,5 @@ """ -Functions for handling "extra" data; i.e. collections of raw files associated with a Dataset +Functions for handling "fileset" data; i.e. 
collections of raw files associated with a Dataset """ from collections import defaultdict @@ -13,13 +13,13 @@ from ..log import logger __all__ = [ - 'process_extra_files', + 'process_fileset_files', ] -def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", extra_dir=".extra", dataset_dir=None, do_copy=False): +def process_fileset_files(*, extract_dir=None, metadata=None, unpack_dir=None, file_glob="*", fileset_dir=".fileset", dataset_dir=None, do_copy=False): """ Process unpacked raw files into its minimal dataset components (data, target, metadata). - Here, 'minimal' means `data` and `target` will be None, and `extra` will contain a + Here, 'minimal' means `data` and `target` will be None, and `fileset` will contain a file dict of files matching the specified file_glob (and their sizes). Parameters @@ -32,11 +32,11 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil Name of the directory of the unpacked zip file containing the raw data files. relative to unpack_dir file_glob: string - Add only files matching this glob pattern to EXTRA - extra_dir: string + Add only files matching this glob pattern to FILESET + fileset_dir: string Used in building the file_dict keys. do_copy: boolean - if True, actually copy the files. Otherwise just build EXTRA + if True, actually copy the files. Otherwise just build FILESET Returns ------- @@ -47,7 +47,7 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil data and target are None, metadata contains a file dict; i.e. - 'extra': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} + 'fileset': {"path_relative_to_processed_dir_1": {"filename_1":["size:33"], "filename_2":["size:54"], ...}, ...} """ if metadata is None: metadata = {} @@ -63,14 +63,14 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil if extract_dir is not None: unpack_dir /= extract_dir - extra_dir = pathlib.Path(extra_dir) - extra_dir_fq = dataset_dir / extra_dir + fileset_dir = pathlib.Path(fileset_dir) + fileset_dir_fq = dataset_dir / fileset_dir logger.debug(f"Do copy: {do_copy}") if do_copy: - if extra_dir_fq.is_dir(): - logger.warning(f"Cleaning contents of {extra_dir}") - shutil.rmtree(extra_dir_fq) - logger.debug(f"Copying files to {extra_dir_fq}...") + if fileset_dir_fq.is_dir(): + logger.warning(f"Cleaning contents of {fileset_dir}") + shutil.rmtree(fileset_dir_fq) + logger.debug(f"Copying files to {fileset_dir_fq}...") file_dict = defaultdict(dict) files = sorted(list(unpack_dir.rglob(file_glob))) @@ -78,11 +78,11 @@ def process_extra_files(*, extract_dir=None, metadata=None, unpack_dir=None, fil if file.is_dir(): continue relative_path = file.relative_to(unpack_dir) - extra_path = extra_dir / relative_path - file_dict[str(extra_path.parent)][str(extra_path.name)] = [f'size:{os.path.getsize(file)}'] + fileset_path = fileset_dir / relative_path + file_dict[str(fileset_path.parent)][str(fileset_path.name)] = [f'size:{os.path.getsize(file)}'] if do_copy: - os.makedirs(dataset_dir / extra_path.parent, exist_ok=True) - shutil.copyfile(file, dataset_dir / extra_path) - metadata['extra'] = dict(file_dict) + os.makedirs(dataset_dir / fileset_path.parent, exist_ok=True) + shutil.copyfile(file, dataset_dir / fileset_path) + metadata['fileset'] = dict(file_dict) return None, None, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py b/{{ 
cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py index 6054735..31cbb1e 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/process_functions.py @@ -3,6 +3,7 @@ """ import pathlib +from sklearn.datasets import fetch_20newsgroups from tqdm.auto import tqdm @@ -10,4 +11,41 @@ from ..log import logger __all__ = [ + 'process_20_newsgroups' ] + +def process_20_newsgroups(*, extract_dir='20_newsgroups', + metadata=None, unpack_dir=None, + opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): + """ + Process 20 newsgroups into (data, target, metadata) format. + + + Parameters + ---------- + unpack_dir: path + The interim parent directory the dataset files have been unpacked into. + extract_dir: str + Name of the directory of the unpacked files relative to the unpack_dir. Note that + opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} + Options to pass to sklearn.datasets.fetch_20newsgroups. + + + Returns + ------- + A tuple: + (data, target, additional_metadata) + + """ + if metadata is None: + metadata = {} + + if unpack_dir is None: + unpack_dir = paths['interim_data_path'] + else: + unpack_dir = pathlib.Path(unpack_dir) + data_dir = unpack_dir / f"{extract_dir}" + + news = fetch_20newsgroups(**opts) + + return news.data, news.target, metadata diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py index 615a3bc..7cdf6ad 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/data/transformer_functions.py @@ -12,10 +12,11 @@ from ..utils import run_notebook __all__ = [ - 'run_notebook_transformer', 'apply_single_function', + 'copy_dataset', 'csv_to_pandas', 'new_dataset', + 'run_notebook_transformer', 'sklearn_train_test_split', 'sklearn_transform', ] @@ -163,23 +164,21 @@ def csv_to_pandas(ds_dict, *, output_map, **opts): new_ds = {} df = None for ds_name, dset in ds_dict.items(): - extra = dset.metadata.get('extra', None) - if extra is not None: - logger.debug(f"Input dataset {ds_name} has extra data. Processing...") - for rel_dir, file_dict in extra.items(): + fileset = dset.metadata.get('fileset', None) + if fileset is not None: + logger.debug(f"Input dataset {ds_name} has fileset data. Processing...") + for rel_dir, file_dict in fileset.items(): for new_dsname, csv_filename in output_map.items(): if csv_filename in file_dict: logger.debug(f"Found {csv_filename}. 
Creating {new_dsname} dataset")
                         path = paths['processed_data_path'] / rel_dir / csv_filename
                         df = pd.read_csv(path)
                         new_metadata = dset.metadata
-                        new_metadata.pop('extra', None)
+                        new_metadata.pop('fileset', None)
                         new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=df, metadata=new_metadata)
     return new_ds
 
-
-
-def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_descr_txt, drop_extra, **opts):
+def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, serialized_function, added_readme_txt, drop_fileset, **opts):
     """
     Parameters
     ----------
@@ -189,12 +188,12 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali
         name of the dataset that the new dataset will be derived from
     dataset_name:
         name of the new dataset_catalog
-    added_descr_txt: Default None
-        new description text to be appended to the metadata descr
+    added_readme_txt: Default None
+        new description text to be appended to the metadata readme
     serialized_function:
         function (serialized by src.utils.serialize_partial) to run on .data to produce the new .data
-    drop_extra: boolean
-        drop the .extra part of the metadata
+    drop_fileset: boolean
+        drop the .fileset part of the metadata
     **opts:
         Remaining options will be ignored
     """
@@ -205,10 +204,10 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali
     ds = ds_dict.get(source_dataset_name)
 
     new_metadata = ds.metadata.copy()
-    new_metadata['descr'] += added_descr_txt
-    if drop_extra:
-        if new_metadata.get('extra', 0) != 0:
-            new_metadata.pop('extra')
+    new_metadata['readme'] += added_readme_txt
+    if drop_fileset:
+        if new_metadata.get('fileset', 0) != 0:
+            new_metadata.pop('fileset')
 
     logger.debug(f"Applying data function...")
     data_function=deserialize_partial(serialized_function)
@@ -222,8 +221,45 @@ def apply_single_function(ds_dict, *, source_dataset_name, dataset_name, seriali
         new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata)
     return new_ds
 
+def copy_dataset(ds_dict, *, source_dataset_name, dataset_name, added_readme_txt, drop_fileset=True, drop_data=True, drop_target=False, **opts):
+    """
+    Create a new dataset by copying an existing one
+    Parameters
+    ----------
+    ds_dict:
+        input datasets.
+ source_dataset_name: + name of the dataset that the new dataset will be derived from + dataset_name: + name of the new dataset_catalog + added_readme_txt: Default None + new description text to be appended to the metadata readme + drop_fileset: boolean + drop the .fileset part of the metadata + **opts: + Remaining options will be ignored + """ + + new_ds = {} + + logger.debug(f"Loading {source_dataset_name}...") + ds = ds_dict.get(source_dataset_name) new_metadata = ds.metadata.copy() + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') - new_ds[new_dsname] = Dataset(dataset_name=new_dsname, data=preprocessed_corpus, metadata=new_metadata) + if drop_data: + new_data = None + else: + new_data = ds.data.copy() + + if drop_target: + new_target = None + else: + new_target = ds.target.copy() + + new_ds[dataset_name] = Dataset(dataset_name=dataset_name, data=new_data, target=new_target, metadata=new_metadata) return new_ds diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py index 1283bab..186704c 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/helpers.py @@ -1,7 +1,9 @@ ## Script common ways of adding a dataset to the workflow from functools import partial +import fsspec import pathlib +import os from .log import logger from . import paths @@ -10,14 +12,17 @@ from .data import (DataSource, Dataset, hash_file, DatasetGraph, Catalog, serialize_transformer_pipeline) from .data.transformer_functions import csv_to_pandas, new_dataset, apply_single_function, run_notebook_transformer -from .data.extra import process_extra_files +from .data.fileset import process_fileset_files from .data.utils import serialize_partial __all__ = [ - 'notebook_as_transformer', 'dataset_from_csv_manual_download', + 'dataset_from_fsurl', 'dataset_from_metadata', 'dataset_from_single_function', + 'derived_dataset', + 'metadata_from_fsspec', + 'notebook_as_transformer', ] @@ -90,7 +95,7 @@ def notebook_as_transformer(notebook_name, *, # Create a Dataset from a single csv file def dataset_from_csv_manual_download(ds_name, csv_path, download_message, - license_str, descr_str, *, hash_type='sha1', + license_str, readme_str, *, hash_type='sha1', hash_value=None, overwrite_catalog=False,): """ @@ -107,7 +112,7 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, Hash, computed via the algorithm specified in `hash_type` license_str: str Contents of metadata license as text - descr_str: + readme_str: Contents of the metadata description as text overwrite_catalog: boolean If True, existing entries in datasets and transformers catalogs will be @@ -136,14 +141,14 @@ def dataset_from_csv_manual_download(ds_name, csv_path, download_message, hash_value=hash_value, unpack_action='copy', force=True) - dsrc.add_metadata(contents=descr_str, force=True) + dsrc.add_metadata(contents=readme_str, force=True) dsrc.add_metadata(contents=license_str, kind='LICENSE', force=True) - process_function = process_extra_files - process_function = process_extra_files + process_function = process_fileset_files + process_function = process_fileset_files process_function_kwargs = {'do_copy':True, 'file_glob':str(csv_path.name), - 'extra_dir': raw_ds_name+'.extra', + 'fileset_dir': raw_ds_name+'.fileset', 'extract_dir': raw_ds_name} 
dsrc.process_function = partial(process_function, **process_function_kwargs) datasource_catalog = Catalog.load('datasources') @@ -202,7 +207,7 @@ def dataset_from_metadata(dataset_name, metadata=None, overwrite_catalog=False): return ds -def dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_descr_txt, drop_extra=True, overwrite_catalog=False): +def dataset_from_single_function(*, source_dataset_name, dataset_name, data_function, added_readme_txt, drop_fileset=True, overwrite_catalog=False): """ Create a derived dataset (dataset_name) via a single function call on .data from a previous dataset (source_dataset_name). @@ -213,8 +218,8 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func name of the dataset that the new dataset will be derived from dataset_name: name of the new dataset_catalog - added_descr_txt: Default None - new description text to be appended to the metadata descr + added_readme_txt: Default None + new description text to be appended to the metadata readme data_function: function (from src module) to run on .data to produce the new .data overwrite_catalog: boolean @@ -223,7 +228,43 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func dag = DatasetGraph(catalog_path=paths['catalog_path']) serialized_function = serialize_partial(data_function) transformers = [partial(apply_single_function, source_dataset_name=source_dataset_name, dataset_name=dataset_name, - serialized_function=serialized_function, added_descr_txt=added_descr_txt, drop_extra=drop_extra)] + serialized_function=serialized_function, added_readme_txt=added_readme_txt, drop_fileset=drop_fileset)] + dag.add_edge(input_dataset=source_dataset_name, + output_dataset=dataset_name, + transformer_pipeline=serialize_transformer_pipeline(transformers), + overwrite_catalog=overwrite_catalog) + ds = Dataset.from_catalog(dataset_name) + logger.debug(f"{dataset_name} added to catalog") + return ds + +def derived_dataset(*, dataset_name, source_dataset_name, added_readme_txt, + drop_fileset=True, drop_data=True, drop_target=False, + overwrite_catalog=False): + """ + Create a derived dataset (dataset_name) via a single function call on .data from a + previous dataset (source_dataset_name). 
+
+    Parameters
+    ----------
+    source_dataset_name:
+        name of the dataset that the new dataset will be derived from
+    dataset_name:
+        name of the new dataset_catalog
+    added_readme_txt: Default None
+        new description text to be appended to the metadata readme
+    drop_fileset: boolean
+        If True, don't copy fileset data to new dataset
+    drop_data: boolean
+        If True, don't copy data to new dataset
+    drop_target: boolean
+        If True, don't copy target to new dataset
+    overwrite_catalog: boolean
+        if True, existing entries in datasets and transformers catalogs will be overwritten
+    """
+    dag = DatasetGraph(catalog_path=paths['catalog_path'])
+    transformers = [partial(copy_dataset, source_dataset_name=source_dataset_name, dataset_name=dataset_name,
+                            added_readme_txt=added_readme_txt, drop_fileset=drop_fileset, drop_data=drop_data, drop_target=drop_target)]
     dag.add_edge(input_dataset=source_dataset_name,
                  output_dataset=dataset_name,
                  transformer_pipeline=serialize_transformer_pipeline(transformers),
@@ -231,3 +272,159 @@ def dataset_from_single_function(*, source_dataset_name, dataset_name, data_func
     ds = Dataset.from_catalog(dataset_name)
     logger.debug(f"{dataset_name} added to catalog")
     return ds
+
+def metadata_from_fsspec(fs, path, metadata=None, fileset=None):
+    """Create metadata, FILESET file list from fsspec URL.
+
+    Creates a metadata dict representing a dataset
+
+    + filenames in all uppercase are assumed to be metadata fields
+    + remaining files are used to populate FILESET data and have their hashes computed.
+
+    Parameters
+    ----------
+    fs:
+        fsspec.filesystem instance (already connected)
+    path:
+        relative to fs
+    metadata:
+        current contents of metadata dict.
+        Metadata obtained from fsurl will overwrite any similarly named fields in this dict
+    fileset:
+        Current contents of FILESET. new data will be appended.
+        Similarly named entries will be overwritten.
+
+    returns metadata dict
+    """
+    # There's a chance this should get rewritten to use 'fsspec.walk'
+
+    if metadata is None:
+        metadata = {}
+    if fileset is None:
+        fileset = metadata.get('fileset', {})
+    protocol = fs.protocol
+    dirs_done = []
+    dirs = [path]
+
+    while dirs:
+        dirname = dirs.pop()
+        rel_dirname = os.path.relpath(dirname, start=path)
+        dirs_done.append(dirname)
+        for file_info in fs.ls(dirname, detail=True):
+            file_type = file_info.get('type', None)
+            file_name = file_info['name']
+            if file_type == 'directory':
+                dirs.append(file_name)
+            elif file_type == 'file':
+                basename = os.path.basename(os.path.normpath(file_name))
+                if str.isupper(basename):
+                    # Add to metadata
+                    with fs.open(file_name, 'r') as fr:
+                        contents = '\n'.join(fr.readlines())
+                    metadata[str.lower(basename)] = contents
+                else:
+                    # add file and hash to FILESET
+                    if protocol == "abfs":
+                        # Cheap way to get md5
+                        md5_arr = file_info['content_settings']['content_md5']
+                        hashval = f"md5:{''.join('{:02x}'.format(x) for x in md5_arr)}"
+                    else:
+                        logger.warning(f"Unsupported fsspec filesystem: {fs.protocol}. Using size as hash")
+                        hashval = f"size:{fs.size(file_name)}"
+                    rel_path = os.path.relpath(file_info['name'], start=dirname) or "."
+ # fileset[rel_dirname][rel_path] = [hashval] + entry = {rel_path:[hashval]} + fileset.setdefault(rel_dirname,{}).update(entry) + else: + raise Exception(f"Unknown file type: {file_type}") + metadata["fileset"] = fileset + return metadata + + + +def dataset_from_fsurl(fsurl, + dataset_name=None, + fsspec_auth=None, + metadata=None, + fileset=None, + overwrite_catalog=True): + """Create a dataset from the contents of an fsspec URL + + 'fsurl' is assumed to be a directory/container/bucket. + + Files in this bucket with names entirely in UPPERCASE are assumed + to be textfiles and are used to populate metadata fields directly + as metadata fields (e.g. README, LICENSE) + + Other files have their hashes added to FILESET, and are included in + the FileSet (FILESET data) associated with the dataset. + + Parameters:: + + fsurl: fsspec URL + Should be a "directory", container, or "subdirectory" of said container. + dataset_name: string or None + Name to use for Dataset. + if None, name is the last component of the fsurl path + metadata: + current contents of metadata dict. + Metadata obtained from fsurl will overwrite any similarly named fields in this dict + fileset: + Current contents of FILESET. new data will be appended. + Similarly named entries will be overwritten. + overwrite_catalog: Boolean + if True, entry in Dataset catalog will be overwritten with the newly generated Dataset + + Returns:: + Dataset containing only metadata and FILESET info for all files in the specified fsspec URL. + + """ + if fsspec_auth is None: + fsspec_auth = {} + + f = fsspec.open(fsurl, **fsspec_auth) + path = f.path + if dataset_name is None: + dataset_name = os.path.basename(os.path.normpath(path)) + logger.debug(f"Inferring dataset_name from fsurl: {dataset_name}") + fs = f.fs + protocol = fs.protocol + meta = metadata_from_fsspec(fs, path, metadata=metadata, fileset=fileset) + meta['fileset_base'] = fsurl + ds = dataset_from_metadata(dataset_name, + metadata=meta, + overwrite_catalog=overwrite_catalog) + return ds + +def derived_dataset(*, dataset_name, source_dataset, added_readme_txt=None, drop_fileset=True, data=None, target=None): + """Create a dataset by copying its metadata from another dataset + + Parameters + ---------- + added_readme_txt: string + String to be appended to the end of the new dataset's README metadata + drop_fileset: boolean + if True, ignore fileset when copying metadata + data: + Will be used as contents of new dataset's `data` + target: + Will be used as contents of new dataset's `target` + dataset_name: String + new dataset name + source_dataset: Dataset + Metadata will be copied from this dataset + + Returns + ------- + new (derived) Dataset object + """ + new_metadata = ds.metadata.copy() + if added_readme_txt: + new_metadata['readme'] += added_readme_txt + if drop_fileset: + if new_metadata.get('fileset', 0) != 0: + new_metadata.pop('fileset') + if new_metadata.get('hashes', 0) != 0: + new_metadata.pop('hashes') + ds_out = Dataset(dataset_name, metadata=new_metadata, data=data, target=target, **kwargs) + return ds_out diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py index 056f497..31f55a8 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/tests/make_test_datasets.py @@ -2,15 +2,12 @@ from functools import partial from {{ 
cookiecutter.module_name }}.data import DataSource, Dataset, DatasetGraph, Catalog -from {{ cookiecutter.module_name }} import workflow, paths +from {{ cookiecutter.module_name }}.data.process_functions import process_20_newsgroups +from {{ cookiecutter.module_name }} import paths from {{ cookiecutter.module_name }}.log import logger # Set up a 20 newsgroups dataset -ds_name = '20_newsgroups' -output_ds_name = ds_name -dsrc = DataSource(ds_name) - license = """ Custom Academic License: "You may use this material free of charge for any educational purpose, provided attribution is given in any lectures or publications that make use of this material." As in http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.data.html. """ @@ -46,51 +43,19 @@ By default we follow the sklearn suggestion to set `remove=('headers', 'footers', 'quotes')` to avoid overfitting. """ +if __name__ =='__main__': + ds_name = '20_newsgroups' + output_ds_name = ds_name + dsrc = DataSource(ds_name) -dsrc.add_metadata(contents=metadata, force=True) -dsrc.add_metadata(contents=license, kind='LICENSE', force=True) - -def process_20_newsgroups(*, extract_dir='20_newsgroups', - metadata=None, unpack_dir=None, - opts={"subset":"all", "remove":"('headers', 'footers', 'quotes')"}): - """ - Process 20 newsgroups into (data, target, metadata) format. - - - Parameters - ---------- - unpack_dir: path - The interim parent directory the dataset files have been unpacked into. - extract_dir: str - Name of the directory of the unpacked files relative to the unpack_dir. Note that - opts: dict default {"subset":"all", "remove"="('headers', 'footers', 'quotes')"} - Options to pass to sklearn.datasets.fetch_20newsgroups. - - - Returns - ------- - A tuple: - (data, target, additional_metadata) - - """ - if metadata is None: - metadata = {} - - if unpack_dir is None: - unpack_dir = paths['interim_data_path'] - else: - unpack_dir = pathlib.Path(unpack_dir) - data_dir = unpack_dir / f"{extract_dir}" - - news = fetch_20newsgroups(**opts) - - return news.data, news.target, metadata + dsrc.add_metadata(contents=metadata, force=True) + dsrc.add_metadata(contents=license, kind='LICENSE', force=True) -process_function = process_20_newsgroups -process_kwargs = {} + process_function = process_20_newsgroups + process_kwargs = {} -dsrc.process_function = partial(process_function, **process_kwargs) -dsrc.update_catalog() + dsrc.process_function = partial(process_function, **process_kwargs) + dsrc.update_catalog() -dag = DatasetGraph() -dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) + dag = DatasetGraph() + dag.add_source(output_dataset=output_ds_name, datasource_name=ds_name, overwrite_catalog=True) diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py index df901a9..03c6d44 100644 --- a/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py +++ b/{{ cookiecutter.repo_name }}/{{ cookiecutter.module_name }}/workflow.py @@ -1,6 +1,11 @@ -# Workflow is where we patch around API issues in between releases. -# Nothing in this file is intended to be a stable API. use at your own risk, -# as its contents will be regularly deprecated +"""A module where we temporarily smooth our way around API issues in Easydata. + +This is a place where we temporarily address UX and API issues in Easydata, usually by writing convenient wrappers around existing functionality. 
+
+Nothing in here is intended to be a stable API: use it at your own risk, as its contents are regularly deprecated.
+
+"""
+
 import sys
 import logging
 from .data import Catalog, Dataset, DataSource
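
Usage sketch (editorial addition, not part of the patch): reading files relative to a Dataset's `fileset_base` via the renamed `fileset_file`/`open_fileset` API. It assumes the generated project's module_name is `src`; the catalog entry, relative path, and S3 credentials are placeholders.

import pandas as pd

from src.data import Dataset

# Hypothetical catalog entry whose metadata includes a fileset_base such as
# "s3://example-bucket/my-dataset"
ds = Dataset.from_catalog("my_fsspec_dataset")

# fileset_file() simply joins the relative path onto fileset_base
print(ds.fileset_file("raw/2020-01-01.csv"))

# open_fileset() returns an fsspec.OpenFile; auth_kwargs is forwarded to fsspec.open(),
# so for an s3:// fileset_base the placeholder credentials below would be s3fs kwargs.
with ds.open_fileset("raw/2020-01-01.csv",
                     auth_kwargs={"key": "<access-key>", "secret": "<secret-key>"}) as f:
    df = pd.read_csv(f)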
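
Usage sketch: wiring `process_fileset_files` up as a `DataSource.process_function`, mirroring the pattern used in `dataset_from_csv_manual_download`. The datasource name and glob are placeholders; `src` is the assumed module_name.

from functools import partial

from src.data import DataSource
from src.data.fileset import process_fileset_files

dsrc = DataSource("my_raw_files")  # hypothetical datasource name
dsrc.process_function = partial(
    process_fileset_files,
    do_copy=True,                        # actually copy files under the dataset directory
    file_glob="*.csv",                   # only pick up csv files
    fileset_dir="my_raw_files.fileset",  # keys in metadata['fileset'] are built relative to this
    extract_dir="my_raw_files",          # subdirectory of the unpack dir to scan
)
dsrc.update_catalog()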
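
Usage sketch for `dataset_from_csv_manual_download` after the `descr`-to-`readme` rename. Every name, message, and the hash below are placeholders.

from src import paths
from src.helpers import dataset_from_csv_manual_download

dataset_from_csv_manual_download(
    "my_manual_csv",                                   # ds_name
    paths["raw_data_path"] / "my_manual_csv.csv",      # csv_path: where the file was placed by hand
    "Download my_manual_csv.csv from <source> and copy it into the raw data directory.",
    license_str="(placeholder) license text",
    readme_str="(placeholder) description of the csv contents",
    hash_value="<sha1-of-the-downloaded-file>",
    overwrite_catalog=True,
)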
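
Usage sketch for `dataset_from_fsurl`, which builds a metadata/FILESET-only Dataset from an fsspec URL. The Azure container, account, and key are placeholders; `fsspec_auth` is passed straight through to `fsspec.open`, so its keys depend on the backend in use (adlfs/abfs shown).

from src.helpers import dataset_from_fsurl

ds = dataset_from_fsurl(
    "abfs://my-container/my-dataset",                      # hypothetical Azure Blob "directory"
    fsspec_auth={"account_name": "myaccount", "account_key": "<key>"},
    overwrite_catalog=True,
)

# UPPERCASE files (README, LICENSE, ...) become metadata fields; everything else is
# recorded under metadata['fileset'] with an md5 (abfs) or size-based pseudo-hash.
print(ds.metadata.get("readme", ""))
print(ds.metadata["fileset"])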
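
Usage sketch for `dataset_from_single_function` with the new keyword names (`added_readme_txt`, `drop_fileset`). The transform is a placeholder; because `data_function` is serialized by reference via `src.utils.serialize_partial`, in a real project it should live somewhere importable inside the `src` module rather than in a notebook cell.

from src.helpers import dataset_from_single_function

def lowercase_docs(docs):
    """Lowercase every document in a list-of-strings .data payload."""
    return [doc.lower() for doc in docs]

ds = dataset_from_single_function(
    source_dataset_name="20_newsgroups",       # assumes this dataset is already in the catalog
    dataset_name="20_newsgroups_lowercase",
    data_function=lowercase_docs,
    added_readme_txt="\nLowercased copy of 20_newsgroups.",
    drop_fileset=True,
    overwrite_catalog=True,
)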