From d3d39335de0d5f645afbfe4e12ac93410bdada51 Mon Sep 17 00:00:00 2001 From: Paul Brabban Date: Tue, 2 Jan 2024 09:39:29 +0000 Subject: [PATCH] Initial commit --- .env_template | 5 ++ .envs/README.md | 3 + .envs/prod.env | 3 + .envs/test.env | 3 + .github/actions/dbt_build/action.yml | 32 ++++++++++ .github/actions/setup_dbt/action.yml | 19 ++++++ .github/workflows/deploy.yml | 36 ++++++++++++ .gitignore | 15 +++++ .vscode/tasks.json | 50 ++++++++++++++++ CONTRIBUTORS.md | 77 +++++++++++++++++++++++++ LICENCE | 21 +++++++ README.md | 70 ++++++++++++++++++++++ dbt_project.yml | 49 ++++++++++++++++ macros/ensure_target_dataset_exists.sql | 14 +++++ models/README.md | 14 +++++ models/daily/package_downloads.sql | 12 ++++ models/daily/package_downloads.yml | 33 +++++++++++ models/file_downloads.sql | 9 +++ models/file_downloads.yml | 16 +++++ models/overview.md | 3 + models/sources/public_pypi.yml | 16 +++++ packages.yml | 3 + profiles.yml | 15 +++++ requirements.txt | 1 + seeds/README.md | 16 +++++ tests/README.md | 3 + 26 files changed, 538 insertions(+) create mode 100644 .env_template create mode 100644 .envs/README.md create mode 100644 .envs/prod.env create mode 100644 .envs/test.env create mode 100644 .github/actions/dbt_build/action.yml create mode 100644 .github/actions/setup_dbt/action.yml create mode 100644 .github/workflows/deploy.yml create mode 100644 .gitignore create mode 100644 .vscode/tasks.json create mode 100644 CONTRIBUTORS.md create mode 100644 LICENCE create mode 100644 README.md create mode 100644 dbt_project.yml create mode 100644 macros/ensure_target_dataset_exists.sql create mode 100644 models/README.md create mode 100644 models/daily/package_downloads.sql create mode 100644 models/daily/package_downloads.yml create mode 100644 models/file_downloads.sql create mode 100644 models/file_downloads.yml create mode 100644 models/overview.md create mode 100644 models/sources/public_pypi.yml create mode 100644 packages.yml create mode 100644 
profiles.yml create mode 100644 requirements.txt create mode 100644 seeds/README.md create mode 100644 tests/README.md diff --git a/.env_template b/.env_template new file mode 100644 index 0000000..e6c0e01 --- /dev/null +++ b/.env_template @@ -0,0 +1,5 @@ +# copy this file to gitignored `.env` and set the environment for your personal workspace + +export DBT_DATASET=sandbox_your_name +export DBT_LOCATION=EU +export DBT_PROJECT=some-project-id # must be the GCP project id, not the project name! \ No newline at end of file diff --git a/.envs/README.md b/.envs/README.md new file mode 100644 index 0000000..0fa6190 --- /dev/null +++ b/.envs/README.md @@ -0,0 +1,3 @@ +This directory contains environment-specific configurations for use in pipeline deployment. + +Example to follow... \ No newline at end of file diff --git a/.envs/prod.env b/.envs/prod.env new file mode 100644 index 0000000..2745ba2 --- /dev/null +++ b/.envs/prod.env @@ -0,0 +1,3 @@ +export DBT_DATASET=pypi +export DBT_LOCATION=US +export DBT_PROJECT=pypi-408816 \ No newline at end of file diff --git a/.envs/test.env b/.envs/test.env new file mode 100644 index 0000000..e5d8662 --- /dev/null +++ b/.envs/test.env @@ -0,0 +1,3 @@ +export DBT_DATASET=pypi_test +export DBT_LOCATION=US +export DBT_PROJECT=pypi-408816 \ No newline at end of file diff --git a/.github/actions/dbt_build/action.yml b/.github/actions/dbt_build/action.yml new file mode 100644 index 0000000..a40d273 --- /dev/null +++ b/.github/actions/dbt_build/action.yml @@ -0,0 +1,32 @@ + +name: dbt build in venv +description: Runs dbt build from venv +inputs: + env: + required: true + description: Environment file to source +runs: + using: composite + steps: + - name: dbt build for ${{ inputs.env }} + shell: bash + run: | + source .venv/bin/activate + source .envs/${{ inputs.env }}.env + rm -rf logs + dbt clean + dbt deps + dbt debug + dbt run + echo "dbt test goes here" + dbt docs generate + - name: upload target artifacts + uses: 
actions/upload-artifact@v3 + with: + name: dbt_artifacts_${{ inputs.env }} + path: | + target + logs + + + diff --git a/.github/actions/setup_dbt/action.yml b/.github/actions/setup_dbt/action.yml new file mode 100644 index 0000000..12b7cf8 --- /dev/null +++ b/.github/actions/setup_dbt/action.yml @@ -0,0 +1,19 @@ + +name: Setup DBT in virtualenv +description: Sets up environment suitable for DBT +runs: + using: composite + steps: + - uses: actions/setup-python@v5 + with: + python-version: '3.11' # dbt does not support 3.12 yet + check-latest: true + - name: setup-python-venv + shell: bash + run: | + python --version + python -m venv .venv + source .venv/bin/activate + pip install -U pip + pip install -U -r requirements.txt + diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..def046e --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,36 @@ +name: deploy-to-gcp +on: + push: {} +jobs: + deploy: + runs-on: ubuntu-latest + env: + PIP_REQUIRE_VIRTUALENV: true + permissions: + contents: read + id-token: write + actions: read + pages: write + steps: + - uses: actions/checkout@v4 + with: + base-ref: ref + - uses: ./.github/actions/setup_dbt + - uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - uses: google-github-actions/setup-gcloud@v2 + with: + version: '>= 363.0.0' + - uses: ./.github/actions/dbt_build + with: + env: test + - uses: ./.github/actions/dbt_build + with: + env: prod + - uses: actions/upload-pages-artifact@v3 + with: + path: target + - uses: actions/deploy-pages@v4 + \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7c80b7f --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +# Python virtualenv files +.venv/ + +# User's environment settings +.env + +# DBT logs +logs/ + +# DBT target dir +target/ + +# DBT packages 
+dbt_packages/ +package-lock.yml \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..1c9019b --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,50 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=733558 + // for the documentation about the tasks.json format + "version": "2.0.0", + "tasks": [ + { + "label": "init_venv", + "type": "shell", + "command": "python", + "args": ["-m", "venv", ".venv"] + }, + { + "label": "ensure_pip_version", + "type": "shell", + "command": "pip", + "args": ["install", "--upgrade", "pip"], + "dependsOn": ["init_venv"] + }, + { + "label": "ensure_python_deps_updated", + "type": "shell", + "command": "pip", + "args": ["install", "-U", "-r", "${workspaceFolder}/requirements.txt"], + "dependsOn": ["init_venv"] + }, + { + "label": "load_user_env", + "type": "shell", + "command": ". ${workspaceFolder}/.env" + }, + { + "label": "ensure_dbt_packages_updated", + "type": "shell", + "command": "dbt", + "args": ["deps", "--upgrade"], + "dependsOn": ["ensure_python_deps_updated", "load_user_env"] + }, + { + "label": "ensure_updated", + "dependsOn": [ + "ensure_pip_version", + "ensure_python_deps_updated", + "ensure_dbt_packages_updated" + ], + "runOptions": { + "runOn": "folderOpen" + } + } + ] + } \ No newline at end of file diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000..8c77682 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,77 @@ +# Contributing + +When contributing to this repository, please first discuss the change you wish to make via issue, +email, or any other method with the owners of this repository before making a change. + +Please note we have a code of conduct, please follow it in all your interactions with the project. + +## Pull Request Process + +A pull request process will be agreed with the first contributor. 
+ +## Code of Conduct + +### Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of experience, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +### Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +### Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. 
+ +### Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +### Enforcement + +Maintainers will monitor the project for breaches of code of conduct. +An enforcement policy will be set up should the need arise. + +### Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ \ No newline at end of file diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..c0b9039 --- /dev/null +++ b/LICENCE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Paul Brabban + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..2f1bf69 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +A candidate template for a standalone dbt-core/dbt-bigquery based repository. + +Thanks to [Equal Experts](https://equalexperts.com) for supporting this work. + +dbt docs automatically published on deployment at https://brabster.github.io/dbt_bigquery_template/ + +# Pre-Reqs + +- Python == 3.11 (see https://docs.getdbt.com/faqs/Core/install-python-compatibility) +- [RECOMMENDED] VSCode to use built-in tasks +- Access to GCP Project enabled for BigQuery +- [RECOMMENDED] set environment variable `PIP_REQUIRE_VIRTUALENV=true` + - Prevents accidentally installing to your system Python installation (if you have permissions to do so) + +# Setup + +- open the terminal + - `Terminal` - `New Terminal` +- update .env with appropriate values + - note project ID not project name (manifests as 404 error) + - `. .env` to update values in use in terminal +- get credentials + - if no valid credential, then error message says default credentials not found + - must be application default credential + - `gcloud auth application-default login` +- `dbt debug` should now succeed and list settings/versions + - if `dbt` is not found, you may need to enter your venv at the terminal + - `. .venv/bin/activate` (`. .venv/Scripts/activate` on Windows/Git-Bash) + +# Assumptions + +This repo is set up based on assumptions of specific ways of working that I have found to work well. +I'll try to describe them here. + +The aim is to apply tried and tested practices that I generally refer to as "engineering" to analytics, so that trust and value can develop. 
+The following set of principles helps explain the choices in this repo structure. + +## Data-as-a-Product + +Whilst this repo can be used for ad-hoc exploration, it's intended to support a shared set of data that consumers can influence and then build on with confidence. + +## You Build It You Run It + +A team is responsible for actively developing the data product this repository describes. That team is responsible for operating the product, resolving issues, and maintaining appropriate stability and robustness to build trust with consumers. + +## Trunk-Based Development + +There is a `main` branch, which is the current version of the data product. This is the only long-lived branch, and will persist from creation of the repository until it is decommissioned. Engineers will branch from `main` to implement a change, then a Pull Request process with appropriate approvals will control the merge of that change back to `main` as the next iteration of the data product. + +## Developer Sandbox Datasets + +In order to develop in a branching style without risk of collision between different work-in-progress, engineers will need a sandbox dataset to work in. I've found that using personal sandboxes in the same project as `main` is a simple approach that works well. +This repo assumes that developers will have such a sandbox (or will have permissions to create one, see `on-run-start` hook in [dbt_project.yml](dbt_project.yml)) and have set their local, personal `.env` variables to refer to it. + +## Always Up-To-Date + +There are several supply chains providing dependencies for this repo. When developing interactively, important sources are: + +- Your Python runtime, including the venv module +- `pip` package manager in the virtualenv +- Python packages via PyPI +- dbt packages + +Aside from the Python runtime, which must be present to bootstrap the repo, these sources are set by default to update automatically to the latest available versions. 
A VSCode task is included to automatically update your local environment, and the CI system will update to latest on each run. + +I believe this setup minimises the risk related to software dependencies that users of this template are exposed to by default. + +## Self-Contained and Self-Describing + +The repo aims to be as self-contained as possible, minimising what's needed in an engineer's development environment, and making the CI setup as similar as possible to that of the engineer's environment. \ No newline at end of file diff --git a/dbt_project.yml b/dbt_project.yml new file mode 100644 index 0000000..866b45b --- /dev/null +++ b/dbt_project.yml @@ -0,0 +1,49 @@ +# there's a bunch of metadata in here that might only really make sense in dbt cloud +# In the deployments I've been involved with I've found that there's no discernable value in setting these values. +# As such I set these parameters to generic values that line everything up in a given repo. +name: 'product' # only referred to in this config file +version: '1.0.0' +config-version: 2 + +# This setting configures which "profile" dbt uses for this project. +profile: 'current' + +# These configurations specify where dbt should look for different types of files. +# The `model-paths` config, for example, states that models in this project can be +# found in the "models/" directory. You probably won't need to change these! 
+model-paths: ["models"] +analysis-paths: ["analyses"] +test-paths: ["tests"] +seed-paths: ["seeds"] +macro-paths: ["macros"] +snapshot-paths: ["snapshots"] + +clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_packages" + +# can document seeds if present +# seeds: +# +persist_docs: +# relation: true +# columns: true + +models: + +grant_access_to: + - project: '{{ target.project }}' + dataset: '{{ target.schema }}' + +persist_docs: + # push any model/column descriptions to the target database + relation: true + columns: true + +tests: + product: + daily: + # tests in the daily folder should have this clause appended to their where clause + # used here to limit the data scanned by the query + # note the predicate pushdown to the underlying timestamp column in the source data + +where: download_date BETWEEN DATE_SUB(CURRENT_DATE, INTERVAL 4 DAY) AND DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY) + +on-run-start: + - "{{ ensure_target_dataset_exists() }}" # may or may not be appropriate for your environmental constraints \ No newline at end of file diff --git a/macros/ensure_target_dataset_exists.sql b/macros/ensure_target_dataset_exists.sql new file mode 100644 index 0000000..e4d7916 --- /dev/null +++ b/macros/ensure_target_dataset_exists.sql @@ -0,0 +1,14 @@ +{% macro ensure_target_dataset_exists() %} + + {% set project_id = target.project %} + {% set dataset_name = target.schema %} + {% set dataset_location = target.location %} + + {% do log("Ensuring dataset " ~ project_id ~ "." 
~ dataset_name ~ " exists in location " ~ dataset_location ) %} + + CREATE SCHEMA IF NOT EXISTS `{{ project_id }}`.`{{ dataset_name }}` + OPTIONS ( + location = '{{ dataset_location }}' + ) + +{% endmacro %} \ No newline at end of file diff --git a/models/README.md b/models/README.md new file mode 100644 index 0000000..1760531 --- /dev/null +++ b/models/README.md @@ -0,0 +1,14 @@ +You can put multiple models and sources into the same `.yml`, as is done in [the jaffle shop example project](https://github.com/dbt-labs/jaffle_shop/blob/main/models/schema.yml). + +I have found that this quickly becomes difficult to manage. +The approach demonstrated here splits out each model into its own `.yml` config file, named for the associated `.sql` file. + + +This way, even when you have tens of models: +- it's easy to check that every model has an associated config +- it's easy to find the config associated with a `.sql` file as it will be immediately below in the file listing + +# overview.md + +You can set a custom overview for your project that appears in your dbt docs site. 
+See https://docs.getdbt.com/docs/collaborate/documentation#setting-a-custom-overview \ No newline at end of file diff --git a/models/daily/package_downloads.sql b/models/daily/package_downloads.sql new file mode 100644 index 0000000..c2d71a6 --- /dev/null +++ b/models/daily/package_downloads.sql @@ -0,0 +1,12 @@ +SELECT + download_date, + package, + package_version, + COUNT(1) AS download_count +FROM {{ ref('file_downloads') }} +GROUP BY + download_date, + package, + package_version + + diff --git a/models/daily/package_downloads.yml b/models/daily/package_downloads.yml new file mode 100644 index 0000000..6dee029 --- /dev/null +++ b/models/daily/package_downloads.yml @@ -0,0 +1,33 @@ +version: 2 + +models: + - name: package_downloads + config: + grants: + roles/bigquery.dataViewer: + - allAuthenticatedUsers + description: Download counts for packages + tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - download_date + - package + - package_version + columns: + - name: download_date + tests: + - not_null + - name: package + description: Package name + tests: + - not_null + - name: package_version + description: Unmodified package version string + tests: + - not_null + - name: download_count + description: Number of downloads recorded + tests: + - not_null + - dbt_utils.accepted_range: + min_value: 0 \ No newline at end of file diff --git a/models/file_downloads.sql b/models/file_downloads.sql new file mode 100644 index 0000000..a83123c --- /dev/null +++ b/models/file_downloads.sql @@ -0,0 +1,9 @@ +SELECT + *, + DATE(timestamp) AS download_date, + file.project AS package, + file.version AS package_version, + SPLIT(file.version, '.') package_version_semver_parts, + details.python AS python_version, + details.installer.name AS installer +FROM {{ source('pypi', 'file_downloads') }} \ No newline at end of file diff --git a/models/file_downloads.yml b/models/file_downloads.yml new file mode 100644 index 0000000..7986f89 --- /dev/null +++ 
b/models/file_downloads.yml @@ -0,0 +1,16 @@ +version: 2 + +models: + - name: file_downloads + description: Stable, enriched view of file_downloads source data + columns: + - name: package + description: Package the download is associated with. Examples `pipenv`, `nose` + - name: package_version + description: Package version for the download. Examples `0.1.6`, `1.4.2` + - name: python_version + description: Python version for the download. Examples `2.7.12`, `3.6.4` + - name: installer + description: Installer responsible for the download. Examples `pip`, `poetry` + - name: package_version_semver_parts + description: Array containing results of split by '.' operation, respecting ordering, on package version \ No newline at end of file diff --git a/models/overview.md b/models/overview.md new file mode 100644 index 0000000..6d11f75 --- /dev/null +++ b/models/overview.md @@ -0,0 +1,3 @@ +{% docs __overview__ %} +Front page for your DBT pages site +{% enddocs %} \ No newline at end of file diff --git a/models/sources/public_pypi.yml b/models/sources/public_pypi.yml new file mode 100644 index 0000000..6799765 --- /dev/null +++ b/models/sources/public_pypi.yml @@ -0,0 +1,16 @@ +version: 2 + +sources: + - name: pypi + database: bigquery-public-data + schema: pypi + description: See documentation at https://packaging.python.org/en/latest/guides/analyzing-pypi-package-downloads/#public-dataset + tables: + - name: file_downloads + tests: + - dbt_utils.recency: + datepart: day + field: timestamp + interval: 3 + config: + where: timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP, INTERVAL 4 DAY) \ No newline at end of file diff --git a/packages.yml b/packages.yml new file mode 100644 index 0000000..3518de4 --- /dev/null +++ b/packages.yml @@ -0,0 +1,3 @@ +packages: + - package: dbt-labs/dbt_utils + version: ">=1.1.1" \ No newline at end of file diff --git a/profiles.yml b/profiles.yml new file mode 100644 index 0000000..4ef3bb0 --- /dev/null +++ b/profiles.yml @@ -0,0 +1,15 @@ 
+config: + send_anonymous_usage_stats: false + use_colors: true + +current: + outputs: + current: + dataset: "{{ env_var('DBT_DATASET') }}" + location: "{{ env_var('DBT_LOCATION') }}" + method: oauth + priority: interactive + project: "{{ env_var('DBT_PROJECT') }}" + threads: "{{ env_var('DBT_THREADS', 8) }}" + type: bigquery + target: current diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9fe9e4d --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +dbt-bigquery>=1.7.0 \ No newline at end of file diff --git a/seeds/README.md b/seeds/README.md new file mode 100644 index 0000000..8cda191 --- /dev/null +++ b/seeds/README.md @@ -0,0 +1,16 @@ +Any seed data you need goes here. + +See https://docs.getdbt.com/docs/build/seeds + +# Gotchas + +## Type Inference + +dbt will try and infer types from column contents in your seed data. + +That causes problems when seed data is updated and the inference chooses a different data type for a column. +For example, a column that happens to contain only numbers gets a new value that's not a number. + +See: https://docs.getdbt.com/reference/resource-configs/column_types. + + diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..6c2072e --- /dev/null +++ b/tests/README.md @@ -0,0 +1,3 @@ +Any custom tests you need to write go here. + +See https://docs.getdbt.com/docs/build/data-tests#singular-data-tests for an example. \ No newline at end of file