Skip to content

Commit

Permalink
Merge pull request #243 from hackalog/dev
Browse files Browse the repository at this point in the history
Release to Main
  • Loading branch information
hackalog authored Feb 1, 2023
2 parents 411a53f + ad6ead4 commit b2b6f21
Show file tree
Hide file tree
Showing 33 changed files with 725 additions and 390 deletions.
31 changes: 15 additions & 16 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
docker:
# specify the version you desire here
# use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers`
- image: cimg/python:3.8.0
- image: continuumio/miniconda3

# Specify service dependencies here if necessary
# CircleCI maintains a library of pre-built images
Expand All @@ -19,39 +19,38 @@ jobs:

steps:
- checkout

- run:
name: Set up Anaconda
name: Set up Conda
command: |
wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh;
chmod +x ~/miniconda.sh;
~/miniconda.sh -b -p ~/miniconda;
export PATH=~/miniconda/bin:$PATH
echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV;
conda update --yes --quiet conda;
conda init bash
sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV
conda update --yes --quiet conda;
export CONDA_EXE=/opt/conda/bin/conda
sed -ne '/>>> conda initialize/,/<<< conda initialize/p' ~/.bashrc >> $BASH_ENV
- run:
name: Build cookiecutter environment and test-env project
command: |
conda create -n cookiecutter --yes python=3.8
conda create -n cookiecutter --yes python=3.8 make
conda activate cookiecutter
pip install cookiecutter
pip install ruamel.yaml
mkdir /home/circleci/.cookiecutter_replay
cp circleci-cookiecutter-easydata.json /home/circleci/.cookiecutter_replay/cookiecutter-easydata.json
mkdir -p /root/repo/.cookiecutter_replay
cp circleci-cookiecutter-easydata.json /root/repo/.cookiecutter_replay/cookiecutter-easydata.json
pwd
which make
cookiecutter --config-file .cookiecutter-easydata-test-circleci.yml . -f --no-input
conda deactivate
- run:
name: Create test-env environment and contrive to always use it
command: |
conda activate cookiecutter
cd test-env
export CONDA_EXE=/home/circleci/miniconda/bin/conda
export CONDA_EXE=/opt/conda/bin/conda
make create_environment
python scripts/tests/add-extra-channel-dependency.py
conda activate test-env
conda install -c anaconda make
touch environment.yml
make update_environment
echo "conda activate test-env" >> $BASH_ENV;
Expand Down
51 changes: 0 additions & 51 deletions .travis.yml

This file was deleted.

18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,24 @@ python -m pip install -f requirements.txt

cookiecutter https://github.com/hackalog/easydata

### To find out more
------------
A good place to start is with reproducible environments. We have a tutorial here: [Getting Started with EasyData Environments](https://github.com/hackalog/easydata/wiki/Getting-Started-with-EasyData-Environments).

The next place to look is the customized documentation included in any EasyData-created repo. It is tailored to the settings you chose in your template. These reference documents live under `references/easydata` and cover:
* more on conda environments
* more on paths
* git configuration (including setting up ssh with GitHub)
* git workflows
* tricks for using Jupyter notebooks in an EasyData environment
* troubleshooting
* recommendations for how to share your work

Furthermore, see:
* [The EasyData documentation on read the docs](https://cookiecutter-easydata.readthedocs.io/en/latest/?badge=latest): this contains up-to-date working examples of how to use EasyData for reproducible datasets and some ways to use notebooks reproducibly
* [Talks and Tutorials based on EasyData](https://github.com/hackalog/easydata/wiki/EasyData-Talks-and-Tutorials)
* [Catalog of EasyData Documentation](https://github.com/hackalog/easydata/wiki/Catalog-of-EasyData-Documentation)
* [The EasyData wiki](https://github.com/hackalog/easydata/wiki) Check here for further troubleshooting and how-to guides for particular problems that aren't in the `references/easydata` docs (including a `git` tutorial)

### The resulting directory structure
------------
Expand Down
6 changes: 3 additions & 3 deletions cookiecutter.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"project_name": "project_name",
"repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}",
"default_branch": ["master", "main"],
"default_branch": ["main", "master"],
"module_name": "src",
"author_name": "Your name (or your organization/company/team)",
"author_name": "Your name (or the copyright holder)",
"description": "A short description of this project.",
"open_source_license": ["MIT", "BSD-2-Clause", "Proprietary"],
"python_version": ["3.7", "3.6", "latest", "3.8"],
"python_version": ["latest", "3.11", "3.10", "3.9", "3.8", "3.7"],
"conda_path": "~/anaconda3/bin/conda",
"upstream_location": ["github.com", "gitlab.com", "bitbucket.org", "your-custom-repo"]
}
2 changes: 1 addition & 1 deletion docs/00-xyz-sample-notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(ds.DESCR)"
"print(ds.README)"
]
},
{
Expand Down
14 changes: 7 additions & 7 deletions docs/Add-csv-template.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
"* `csv_path`: The desired path to your .csv file (in this case `epidemiology.csv`) relative to paths['raw_data_path']\n",
"* `download_message`: The message to display to indicate to the user how to manually download your .csv file.\n",
"* `license_str`: Information on the license for the dataset\n",
"* `descr_str`: Information on the dataset itself"
"* `readme_str`: Information on the dataset itself"
]
},
{
Expand Down Expand Up @@ -123,7 +123,7 @@
"metadata": {},
"outputs": [],
"source": [
"descr_str = \"\"\"\n",
"readme_str = \"\"\"\n",
"The epidemiology table from Google's [COVID-19 Open-Data dataset](https://github.com/GoogleCloudPlatform/covid-19-open-data). \n",
"\n",
"The full dataset contains datasets of daily time-series data related to COVID-19 for over 20,000 distinct locations around the world. The data is at the spatial resolution of states/provinces for most regions and at county/municipality resolution for many countries such as Argentina, Brazil, Chile, Colombia, Czech Republic, Mexico, Netherlands, Peru, United Kingdom, and USA. All regions are assigned a unique location key, which resolves discrepancies between ISO / NUTS / FIPS codes, etc. The different aggregation levels are:\n",
Expand Down Expand Up @@ -170,7 +170,7 @@
" csv_path=csv_path,\n",
" download_message=download_message,\n",
" license_str=license_str,\n",
" descr_str=descr_str,\n",
" readme_str=readme_str,\n",
" overwrite_catalog=True)"
]
},
Expand Down Expand Up @@ -206,9 +206,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.EXTRA`.\n",
"By default, the workflow helper function also created a `covid-19-epidemiology_raw` dataset that has an empty `ds.data`, but keeps a record of the location of the final `epidemiology.csv` file relative to in `ds.FILESET`.\n",
"\n",
"The `.EXTRA` functionality is covered in other documentation."
"The `.FILESET` functionality is covered in other documentation."
]
},
{
Expand Down Expand Up @@ -236,7 +236,7 @@
"metadata": {},
"outputs": [],
"source": [
"ds_raw.EXTRA"
"ds_raw.FILESET"
]
},
{
Expand All @@ -246,7 +246,7 @@
"outputs": [],
"source": [
"# fq path to epidemiology.csv file\n",
"ds_raw.extra_file('epidemiology.csv')"
"ds_raw.fileset_file('epidemiology.csv')"
]
},
{
Expand Down
10 changes: 5 additions & 5 deletions docs/Add-derived-dataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(ds.DESCR)"
"print(ds.README)"
]
},
{
Expand Down Expand Up @@ -219,7 +219,7 @@
" source_dataset_name\n",
" dataset_name\n",
" data_function\n",
" added_descr_txt\n",
" added_readme_txt\n",
"\n",
"We'll want our `data_function` to be defined in the project module (in this case `src`) for reproducibility reasons (which we've already done with `subselect_by_key` above)."
]
Expand Down Expand Up @@ -250,7 +250,7 @@
"metadata": {},
"outputs": [],
"source": [
"added_descr_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n",
"added_readme_txt = f\"\"\"The dataset {dataset_name} is the subselection \\\n",
"to the {key} dataset.\"\"\""
]
},
Expand Down Expand Up @@ -281,7 +281,7 @@
" source_dataset_name=source_dataset_name,\n",
" dataset_name=dataset_name,\n",
" data_function=data_function,\n",
" added_descr_txt=added_descr_txt,\n",
" added_readme_txt=added_readme_txt,\n",
" overwrite_catalog=True)"
]
},
Expand Down Expand Up @@ -318,7 +318,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(ds.DESCR)"
"print(ds.README)"
]
},
{
Expand Down
12 changes: 6 additions & 6 deletions docs/New-Dataset-Template.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@
"metadata": {},
"source": [
"### Create a process function\n",
"By default, we recommend that you use the `process_extra_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own."
"By default, we recommend that you use the `process_fileset_files` functionality and then use a transformer function to create a derived dataset, but you can optionally create your own."
]
},
{
Expand All @@ -176,11 +176,11 @@
"metadata": {},
"outputs": [],
"source": [
"from src.data.extra import process_extra_files\n",
"process_function = process_extra_files\n",
"from src.data.fileset import process_fileset_files\n",
"process_function = process_fileset_files\n",
"process_function_kwargs = {'file_glob':'*.csv',\n",
" 'do_copy': True,\n",
" 'extra_dir': ds_name+'.extra',\n",
" 'fileset_dir': ds_name+'.fileset',\n",
" 'extract_dir': ds_name}"
]
},
Expand Down Expand Up @@ -355,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
"ds.EXTRA"
"ds.FILESET"
]
},
{
Expand All @@ -364,7 +364,7 @@
"metadata": {},
"outputs": [],
"source": [
"ds.extra_file('epidemiology.csv')"
"ds.fileset_file('epidemiology.csv')"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions docs/New-Edge-Template.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
"metadata": {},
"outputs": [],
"source": [
"source_ds.EXTRA"
"source_ds.FILESET"
]
},
{
Expand Down Expand Up @@ -178,7 +178,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(ds.DESCR)"
"print(ds.README)"
]
},
{
Expand Down
3 changes: 3 additions & 0 deletions docs/test_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import requests

from src import paths
from src.log import logger


CCDS_ROOT = Path(__file__).parents[1].resolve()
DOCS_DIR = CCDS_ROOT / "docs"
Expand All @@ -35,6 +37,7 @@ def test_notebook_csv(self):
csv_url = "https://storage.googleapis.com/covid19-open-data/v2/epidemiology.csv"
csv_dest = paths['raw_data_path'] / "epidemiology.csv"
if not csv_dest.exists():
logger.debug("Downloading epidemiology.csv")
csv_file = requests.get(csv_url)
with open(csv_dest, 'wb') as f:
f.write(csv_file.content)
Expand Down
11 changes: 2 additions & 9 deletions {{ cookiecutter.repo_name }}/.circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ jobs:
docker:
# specify the version you desire here
# use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers`
- image: circleci/python:3.7.0
- image: continuumio/miniconda3


# Specify service dependencies here if necessary
# CircleCI maintains a library of pre-built images
Expand All @@ -20,14 +21,6 @@ jobs:
steps:
- checkout

- run:
name: Set up Anaconda
command: |
wget -q http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh;
chmod +x ~/miniconda.sh;
~/miniconda.sh -b -p ~/miniconda;
echo "export PATH=~/miniconda/bin:$PATH" >> $BASH_ENV;
- run:
name: Create environment and contrive to always use it
command: |
Expand Down
9 changes: 2 additions & 7 deletions {{ cookiecutter.repo_name }}/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,12 @@ test: update_environment
$(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \
$(MODULE_NAME)

## Run all Unit Tests with coverage
## Run all Unit and code coverage tests
test_with_coverage: update_environment
$(SET) LOGLEVEL=DEBUG; coverage run -m pytest --pyargs --doctest-modules --doctest-continue-on-failure --verbose \
$(if $(CI_RUNNING),--ignore=$(TESTS_NO_CI)) \
$(MODULE_NAME)

.PHONY: lint
## Lint using flake8
lint:
flake8 $(MODULE_NAME)

.phony: help_update_easydata
help_update_easydata:
@$(PYTHON_INTERPRETER) scripts/help-update.py
Expand All @@ -105,7 +100,7 @@ debug:
# Self Documenting Commands #
#################################################################################

HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM
HELP_VARS := PROJECT_NAME DEBUG_FILE ARCH PLATFORM SHELL

.DEFAULT_GOAL := show-help
.PHONY: show-help
Expand Down
Loading

0 comments on commit b2b6f21

Please sign in to comment.