From d22ee990f8ad972ff662fce3895d42c8bdc901d6 Mon Sep 17 00:00:00 2001 From: ilias1111 <4363375+ilias1111@users.noreply.github.com> Date: Fri, 11 Oct 2024 09:14:26 +0000 Subject: [PATCH 1/4] [create-pull-request] automated change --- src/componentVersions.js | 2 +- .../Schemas/dbtUtils_0.17.0.json | 346 ++++++++++++++++++ 2 files changed, 347 insertions(+), 1 deletion(-) create mode 100644 src/components/JsonSchemaValidator/Schemas/dbtUtils_0.17.0.json diff --git a/src/componentVersions.js b/src/componentVersions.js index f65b19063b..d42566137b 100644 --- a/src/componentVersions.js +++ b/src/componentVersions.js @@ -45,7 +45,7 @@ export const versions = { dbtSnowplowUnified: '0.4.5', dbtSnowplowWeb: '1.0.1', dbtSnowplowMobile: '1.0.0', - dbtSnowplowUtils: '0.16.8', + dbtSnowplowUtils: '0.17.0', dbtSnowplowMediaPlayer: '0.8.0', dbtSnowplowNormalize: '0.3.5', dbtSnowplowFractribution: '0.3.6', diff --git a/src/components/JsonSchemaValidator/Schemas/dbtUtils_0.17.0.json b/src/components/JsonSchemaValidator/Schemas/dbtUtils_0.17.0.json new file mode 100644 index 0000000000..0cf430daab --- /dev/null +++ b/src/components/JsonSchemaValidator/Schemas/dbtUtils_0.17.0.json @@ -0,0 +1,346 @@ +{ + "definitions": { + "passthrough_vars": { + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "title": "Type", + "oneOf": [ + { + "type": "string", + "title": "Column Name" + }, + { + "type": "object", + "title": "SQL & Alias", + "properties": { + "sql": { + "type": "string", + "title": "Field SQL", + "format": "sql", + "order": 1 + }, + "alias": { + "type": "string", + "title": "Field Alias", + "order": 2 + } + }, + "required": [ + "sql", + "alias" + ], + "additionalProperties": false + } + ] + }, + "uniqueItems": true + } + }, + "type": "object", + "properties": { + "snowplow__database": { + "recommendFullRefresh": true, + "order": 1, + "consoleGroup": "required", + "type": "string", + "title": "Database", + "description": "Database that contains your atomic events", + "longDescription": "The database that contains your atomic events table.", + "packageDefault": "target.database", + "group": "Warehouse and Tracker" + }, + "snowplow__dev_target_name": { + "recommendFullRefresh": false, + "order": 87, + "consoleGroup": "advanced", + "type": "string", + "title": "Dev Target", + "description": "Target name of your development environment as defined in your `profiles.yml` file", + "longDescription": "The [target name](https://docs.getdbt.com/docs/core/connect-data-platform/profiles.yml) of your development environment as defined in your `profiles.yml` file. See the [Manifest Tables](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/manifest-tables/) section for more details.", + "packageDefault": "dev", + "group": "Warehouse and Tracker" + }, + "snowplow__events_table": { + "recommendFullRefresh": true, + "order": 5, + "consoleGroup": "required", + "type": "string", + "title": "Events Table", + "description": "The name of the table that contains your atomic events", + "longDescription": "The name of the table that contains your atomic events.", + "packageDefault": "events", + "group": "Warehouse and Tracker" + }, + "snowplow__events_schema": { + "recommendFullRefresh": true, + "order": 4, + "consoleGroup": "basic", + "type": "string", + "title": "Events Schema", + "description": "The name of the schema that contains your atomic events", + "longDescription": "The name of the schema that contains your atomic events.", + "packageDefault": "events", + "group": "Warehouse and Tracker" + }, + "snowplow__allow_refresh": { + "recommendFullRefresh": true, + "order": 39, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Allow Refresh", + "group": "Operation and Logic", + "longDescription": "Used as the default value to return from the `allow_refresh()` macro. This macro determines whether the manifest tables can be refreshed or not, depending on your environment. See the [Manifest Tables](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/manifest-tables/) section for more details.", + "packageDefault": "false" + }, + "snowplow__backfill_limit_days": { + "recommendFullRefresh": false, + "order": 41, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Backfill Limit", + "group": "Operation and Logic", + "longDescription": "The maximum numbers of days of new data to be processed since the latest event processed. Please refer to the [incremental logic](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/incremental-processing/#package-state) section for more details.", + "packageDefault": "30", + "description": "The maximum numbers of days of new data to be processed since the latest event processed" + }, + "snowplow__custom_sql": { + "recommendFullRefresh": false, + "order": 84, + "consoleGroup": "advanced", + "type": "string", + "title": "Custom SQL", + "group": "Operation and Logic", + "longDescription": "This allows you to introduce custom sql to the `snowplow_base_events_this_run` table, which you can then leverage in downstream models. For more information on the usage, see the following page on the [advanced usage of the utils package](/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-custom-models/examples/additional-sql-on-events-this-run/).", + "packageDefault": "", + "description": "Custom SQL for your base events this run table." + }, + "snowplow__days_late_allowed": { + "recommendFullRefresh": true, + "order": 42, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Days Late Allowed", + "group": "Operation and Logic", + "longDescription": "The maximum allowed number of days between the event creation and it being sent to the collector. Exists to reduce lengthy table scans that can occur as a result of late arriving data.", + "packageDefault": "3", + "description": "The maximum allowed number of days between the event creation and it being sent to the collector" + }, + "snowplow__max_session_days": { + "recommendFullRefresh": true, + "order": 110, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Max Session Length", + "longDescription": "The maximum allowed session length in days. For a session exceeding this length, all events after this limit will stop being processed. Exists to reduce lengthy table scans that can occur due to long sessions which are usually a result of bots.", + "packageDefault": "3", + "group": "Operation and Logic", + "description": "The maximum allowed session length in days. For a session exceeding this length, all events after this limit will stop being processed" + }, + "snowplow__package_name": { + "recommendFullRefresh": false, + "order": 113, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Package Name", + "group": "Operation and Logic", + "longDescription": "The name of the package you are running this macro under. This has implications for your `manifest` table.", + "packageDefault": "snowplow", + "description": "The name of the package you are running this macro under" + }, + "snowplow__session_lookback_days": { + "recommendFullRefresh": false, + "order": 121, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Session Lookback Window", + "longDescription": "Number of days to limit scan on `snowplow_web_base_sessions_lifecycle_manifest` manifest. Exists to improve performance of model when we have a lot of sessions. Should be set to as large a number as practical.", + "packageDefault": "730", + "group": "Operation and Logic", + "description": "Number of days to limit scan on `snowplow_web_base_sessions_lifecycle_manifest` manifest" + }, + "snowplow__upsert_lookback_days": { + "recommendFullRefresh": false, + "order": 126, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Upsert Lookback Days", + "group": "Operation and Logic", + "longDescription": "Number of days to look back over the incremental derived tables during the upsert. Where performance is not a concern, should be set to as long a value as possible. Having too short a period can result in duplicates. Please see the [Snowplow Optimized Materialization](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/optimized-upserts/) section for more details.", + "packageDefault": "30", + "description": "Number of days to look back over the incremental derived tables during the upsert" + }, + "snowplow__app_id": { + "recommendFullRefresh": false, + "order": 8, + "consoleGroup": "basic", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "title": "App IDs", + "longDescription": "A list of `app_id`s to filter the events table on for processing within the package.", + "packageDefault": "[ ] (no filter applied)", + "group": "Contexts, Filters, and Logs", + "items": { + "type": "string" + } + }, + "snowplow__session_identifiers": { + "recommendFullRefresh": true, + "order": 46, + "consoleGroup": "advanced", + "title": "Session Identifiers", + "group": "Operation and Logic", + "longDescription": "A list of key:value dictionaries which contain all of the contexts and fields where your session identifiers are located. For each entry in the list, if your map contains the `schema` value `atomic`, then this refers to a field found directly in the atomic `events` table. If you are trying to introduce a context/entity with an identifier in it, the package will look for the context in your events table with the name specified in the `schema` field. It will use the specified value in the `field` key as the field name to access. For Redshift/Postgres, using the `schema` key the package will try to find a table in your `snowplow__events_schema` schema with the same name as the `schema` value provided, and join that. If multiple fields are specified, the package will try to coalesce all fields in the order specified in the list. For a better understanding of the advanced usage of this variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details.", + "packageDefault": "[{\"schema\" : \"atomic\", \"field\" : \"domain_sessionid\"}]", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "object", + "title": "Identifier", + "properties": { + "schema": { + "type": "string", + "title": "(JSON) schema name for the field", + "order": 1, + "description": "The schema name of your events table, atomic in most use cases, alternatively for sdes/contexts this should instead be the name of the field itself" + }, + "field": { + "type": "string", + "order": 2, + "title": "Field name", + "description": "The name of the field to use as session identifier, alternatively, in case of sdes/contexts it is the name of the element that refers to the field to be extracted" + } + }, + "required": [ + "schema", + "field" + ], + "additionalProperties": false + }, + "uniqueItems": true + }, + "snowplow__session_sql": { + "recommendFullRefresh": true, + "order": 47, + "consoleGroup": "advanced", + "type": "string", + "title": "SQL for your session identifier", + "longDescription": "This allows you to override the `session_identifiers` SQL, to define completely custom SQL in order to build out a session identifier for your events. If you are interested in using this instead of providing identifiers through the `session_identifiers` variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details on how to do that.", + "packageDefault": "", + "group": "Operation and Logic" + }, + "snowplow__session_timestamp": { + "recommendFullRefresh": false, + "order": 55, + "consoleGroup": "advanced", + "type": "string", + "title": "Timestamp used for incremental processing, should be your partition field", + "group": "Operation and Logic", + "longDescription": "Determines which timestamp is used to build the sessionization logic. It's a good idea to have this timestamp be the same timestamp as the field you partition your events table on.", + "packageDefault": "collector_tstamp" + }, + "snowplow__user_identifiers": { + "recommendFullRefresh": true, + "order": 48, + "consoleGroup": "advanced", + "title": "User Identifiers", + "group": "Operation and Logic", + "longDescription": "A list of key:value dictionaries which contain all of the contexts and fields where your user identifiers are located. For each entry in the list, if your map contains the `schema` value `atomic`, then this refers to a field found directly in the atomic `events` table. If you are trying to introduce a context/entity with an identifier in it, the package will look for the context in your events table with the name specified in the `schema` field. It will use the specified value in the `field` key as the field name to access. For Redshift/Postgres, using the `schema` key the package will try to find a table in your `snowplow__events_schema` schema with the same name as the `schema` value provided, and join that. If multiple fields are specified, the package will try to coalesce all fields in the order specified in the list. For a better understanding of the advanced usage of this variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details.", + "packageDefault": "[{\"schema\" : \"atomic\", \"field\" : \"domain_userid\"}]", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "object", + "title": "Identifier", + "properties": { + "schema": { + "type": "string", + "title": "(JSON) schema name for the field", + "order": 1, + "description": "The schema name of your events table, atomic in most use cases, alternatively for sdes/contexts this should instead be the name of the field itself" + }, + "field": { + "type": "string", + "order": 2, + "title": "Field name", + "description": "The name of the field to use as user identifier, alternatively, in case of sdes/contexts it is the name of the element that refers to the field to be extracted" + } + }, + "required": [ + "schema", + "field" + ], + "additionalProperties": false + }, + "uniqueItems": true + }, + "snowplow__user_sql": { + "recommendFullRefresh": true, + "order": 49, + "consoleGroup": "advanced", + "type": "string", + "title": "SQL for your user identifier", + "longDescription": "This allows you to override the `user_identifiers` SQL, to define completely custom SQL in order to build out a user identifier for your events. If you are interested in using this instead of providing identifiers through the `user_identifiers` variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details on how to do that.", + "packageDefault": "", + "group": "Operation and Logic" + }, + "snowplow__entities_or_sdes": { + "recommendFullRefresh": false, + "order": 104, + "consoleGroup": "advanced", + "title": "(Redshift) Entities or SDEs", + "longDescription": "A list of dictionaries defining the `context` or `self-describing` event tables to join onto your base events table. Please use the tool below or see the section on [Utilizing custom contexts or SDEs](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/modeling-entities/) for details of the structure.", + "packageDefault": "[]", + "warehouse": "Redshift", + "group": "Warehouse Specific", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "object", + "title": "Entity or SDE", + "properties": { + "schema": { + "type": "string", + "title": "Table name", + "description": "Table name", + "order": 1 + }, + "prefix": { + "type": "string", + "title": "Column prefix", + "description": "Prefix to add to columns", + "order": 2 + }, + "alias": { + "type": "string", + "title": "CTE Alias", + "description": "Table alias for the subquery", + "order": 3 + }, + "single_entity": { + "type": "boolean", + "title": "Is single entity?", + "order": 4 + } + }, + "required": [ + "schema", + "prefix" + ], + "additionalProperties": false + }, + "uniqueItems": true + } + } +} From 24784a5324e6f640341fc40d21a572d768ef3303 Mon Sep 17 00:00:00 2001 From: Ilias Xenogiannis Date: Fri, 11 Oct 2024 14:16:01 +0300 Subject: [PATCH 2/4] Update spark platform information --- .../dbt-configuration/index.md | 17 +++++++++++++++++ .../dbt-operation/lakes/index.md | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md index 04ce025541..3566b8016f 100644 --- a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md +++ b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md @@ -44,6 +44,23 @@ To optimism performance of large Postgres datasets you can create [indexes](http }} ``` +### Spark + +For Spark environments, Iceberg is currently the supported file format for external tables. We have successfully tested this setup using both Glue and Thrift as connection methods. To use these models, create an external table from the Iceberg lake format in Spark and point your dbt model to this table. + +Here's an example profiles.yml configuration for Spark using Thrift: +``` yaml +spark: + type: spark + host: localhost + method: thrift + port: 10000 + schema: default +``` + +In your dbt_project.yml, the file_format is set to iceberg by default for Spark. While you can override this in your project's dbt YAML file to use a different file format, please note that Iceberg is currently the only officially supported format. + + ### Databricks You can connect to Databricks using either the `dbt-spark` or the `dbt-databricks` connectors. The `dbt-spark` adapter does not allow dbt to take advantage of certain features that are unique to Databricks, which you can take advantage of when using the `dbt-databricks` adapter. Where possible, we would recommend using the `dbt-databricks` adapter. diff --git a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md index f1c7c7428c..44e5fc6d18 100644 --- a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md +++ b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md @@ -27,7 +27,7 @@ At time of writing, `Iceberg` is the preferred file format for Snowflake [iceber Note that compared to the other loaders for Snowflake, that field names in Self-describing events and Entities are converted to `snake_case` format (the other loaders retain the format used in the schema, often `camelCase`). You will need to adjust other variables and inputs accordingly compared to what you may find in the docs. # Spark -Currently using spark directly as a compute engine is not supported for our packages. +At time of writing, `Iceberg` is the supported file format for Spark external tables. We've tested this using Glue and Thrift as a connection method. If you create an external table from this lake format in Spark, you should be able to run the models by pointing the model at this table. For more information on setting up dbt with Spark using Thrift, please refer to the [dbt Spark documentation on Thrift](https://docs.getdbt.com/docs/core/connect-data-platform/spark-setup#thrift). # Redshift (spectrum) Currently using Redshift Spectrum tables is not supported for our packages due to [limitations](https://docs.aws.amazon.com/redshift/latest/dg/nested-data-restrictions.html) with the platform. From c4152b645709e8d92268b59493c0084e2b880c3b Mon Sep 17 00:00:00 2001 From: Ilias Xenogiannis Date: Mon, 14 Oct 2024 12:52:21 +0300 Subject: [PATCH 3/4] Update index.md --- .../modeling-your-data-with-dbt/dbt-operation/lakes/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md index 44e5fc6d18..c9ce9a7e3c 100644 --- a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md +++ b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md @@ -27,7 +27,7 @@ At time of writing, `Iceberg` is the preferred file format for Snowflake [iceber Note that compared to the other loaders for Snowflake, that field names in Self-describing events and Entities are converted to `snake_case` format (the other loaders retain the format used in the schema, often `camelCase`). You will need to adjust other variables and inputs accordingly compared to what you may find in the docs. # Spark -At time of writing, `Iceberg` is the supported file format for Spark external tables. We've tested this using Glue and Thrift as a connection method. If you create an external table from this lake format in Spark, you should be able to run the models by pointing the model at this table. For more information on setting up dbt with Spark using Thrift, please refer to the [dbt Spark documentation on Thrift](https://docs.getdbt.com/docs/core/connect-data-platform/spark-setup#thrift). +At time of writing, `Iceberg` is the supported file format for Spark external tables. We've tested this using Glue and Thrift as a connection method. If you have your event data in Iceberg format in a lake, you should be able to run the models by pointing the packages to a spark deployment, connected to that lake. For more information on setting up dbt with Spark using Thrift, please refer to the [dbt Spark documentation on Thrift](https://docs.getdbt.com/docs/core/connect-data-platform/spark-setup#thrift). # Redshift (spectrum) Currently using Redshift Spectrum tables is not supported for our packages due to [limitations](https://docs.aws.amazon.com/redshift/latest/dg/nested-data-restrictions.html) with the platform. From 078697532843483fbf9f8093f1366bb88b5ba85a Mon Sep 17 00:00:00 2001 From: Ilias Xenogiannis Date: Mon, 14 Oct 2024 14:19:11 +0300 Subject: [PATCH 4/4] Update docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md Co-authored-by: Agnes Kiss <95634439+agnessnowplow@users.noreply.github.com> --- .../modeling-your-data-with-dbt/dbt-configuration/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md index 3566b8016f..8daa877777 100644 --- a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md +++ b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-configuration/index.md @@ -58,7 +58,7 @@ spark: schema: default ``` -In your dbt_project.yml, the file_format is set to iceberg by default for Spark. While you can override this in your project's dbt YAML file to use a different file format, please note that Iceberg is currently the only officially supported format. +In your dbt_project.yml, the file_format is set to `iceberg` by default for Spark. While you can override this in your project's dbt YAML file to use a different file format, please note that Iceberg is currently the only officially supported format. ### Databricks