From d42d6114c2f4181bdb5145574401c8ae28a01538 Mon Sep 17 00:00:00 2001 From: Zach Wilson Date: Thu, 30 Dec 2021 16:02:59 -0800 Subject: [PATCH] initial repo commit --- .idea/.gitignore | 5 +++ .idea/cumulative-table-design.iml | 12 ++++++ .idea/misc.xml | 6 +++ .idea/modules.xml | 8 ++++ .idea/vcs.xml | 6 +++ README.md | 12 ++++++ queries/active_users_cumulated_populate.sql | 48 +++++++++++++++++++++ queries/active_users_daily_populate.sql | 9 ++++ tables/active_users_cumulated.sql | 9 ++++ tables/active_users_daily.sql | 5 +++ 10 files changed, 120 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/cumulative-table-design.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 README.md create mode 100644 queries/active_users_cumulated_populate.sql create mode 100644 queries/active_users_daily_populate.sql create mode 100644 tables/active_users_cumulated.sql create mode 100644 tables/active_users_daily.sql diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..b58b603 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,5 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/cumulative-table-design.iml b/.idea/cumulative-table-design.iml new file mode 100644 index 0000000..24643cc --- /dev/null +++ b/.idea/cumulative-table-design.iml @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..28a804d --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..3a2391b --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ 
b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..eaef50b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# Cumulative Table Design Example
+
+This repo shows how to build a robust cumulative table design with an example of monthly active users.
+
+In this repo, we'll be using **2022-01-01** as 'today' and **2021-12-31** as 'yesterday'
+
+
+The steps for this design are:
+
+- The Daily Table step
+    - In this step we aggregate just the events of today to see who is daily active
+    - This query is pretty simple and straightforward; check `queries/active_users_daily_populate.sql`
diff --git a/queries/active_users_cumulated_populate.sql b/queries/active_users_cumulated_populate.sql
new file mode 100644
index 0000000..d170e3e
--- /dev/null
+++ b/queries/active_users_cumulated_populate.sql
@@ -0,0 +1,69 @@
+-- Appends today's snapshot to active_users_cumulated by combining yesterday's
+-- cumulated snapshot with today's daily activity. activity_array holds one 0/1
+-- flag per day, most recent day first, capped at 30 entries.
+INSERT INTO active_users_cumulated (
+    user_id,
+    is_daily_active,
+    is_weekly_active,
+    is_monthly_active,
+    activity_array,
+    snapshot_date
+)
+
+-- First read in yesterday from the cumulated table
+WITH yesterday AS (
+    SELECT
+        user_id,
+        activity_array
+    FROM active_users_cumulated
+    WHERE snapshot_date = DATE '2021-12-31'
+),
+
+-- Read in the daily active user numbers for just today from the daily table
+today AS (
+    SELECT
+        user_id,
+        is_active_today
+    FROM active_users_daily
+    WHERE snapshot_date = DATE '2022-01-01'
+),
+
+-- We FULL OUTER JOIN today and yesterday. We need some COALESCE handling because
+-- activity_array may not exist yet for a given user (i.e. they are brand new), and
+-- is_active_today is NULL for users who did not generate an event today
+combined AS (
+    SELECT
+        -- Either side of the FULL OUTER JOIN may be missing, so take whichever
+        -- user_id is present
+        COALESCE(y.user_id, t.user_id) AS user_id,
+        CASE
+            -- Brand new user: start a fresh one-element history
+            WHEN y.activity_array IS NULL
+                THEN ARRAY[COALESCE(t.is_active_today, 0)]
+            -- History still shorter than 30 days: just prepend today
+            WHEN CARDINALITY(y.activity_array) < 30
+                THEN ARRAY[COALESCE(t.is_active_today, 0)] || y.activity_array
+            -- Full history: keep the 29 most recent days and prepend today.
+            -- SLICE is 1-indexed; a negative start would count from the end
+            -- of the array and keep only the oldest day
+            ELSE ARRAY[COALESCE(t.is_active_today, 0)] || SLICE(y.activity_array, 1, 29)
+        END AS activity_array,
+        -- t.snapshot_date is NULL for users with no activity today, so stamp
+        -- every row with today's date explicitly
+        DATE '2022-01-01' AS snapshot_date
+    FROM yesterday AS y
+    FULL OUTER JOIN today AS t
+        ON y.user_id = t.user_id
+)
+
+-- Column order matches the INSERT column list above
+SELECT
+    user_id,
+    activity_array[1] AS is_daily_active,
+    -- if any of the first 7 array values are non-zero, then the user was active in the last week
+    CASE WHEN ARRAY_SUM(SLICE(activity_array, 1, 7)) > 0 THEN 1 ELSE 0 END AS is_weekly_active,
+    -- if any of the array values are 1, then the user was active in the last month
+    CASE WHEN ARRAY_SUM(activity_array) > 0 THEN 1 ELSE 0 END AS is_monthly_active,
+    activity_array,
+    snapshot_date
+FROM combined
diff --git a/queries/active_users_daily_populate.sql b/queries/active_users_daily_populate.sql
new file mode 100644
index 0000000..d544063
--- /dev/null
+++ b/queries/active_users_daily_populate.sql
@@ -0,0 +1,19 @@
+-- Builds today's rows of active_users_daily: one row per user seen in events
+INSERT INTO active_users_daily (
+    user_id,
+    is_active_today,
+    snapshot_date
+)
+
+SELECT
+    user_id,
+    -- If the user_id has at least 1 event, they are daily active
+    CASE WHEN COUNT(user_id) > 0 THEN 1 ELSE 0 END AS is_active_today,
+    -- Stamp the rows with the day being processed so the cumulated step can
+    -- filter on snapshot_date
+    DATE '2022-01-01' AS snapshot_date
+FROM events
+-- NOTE(review): events is not defined in this repo; assumes event_date
+-- compares against this literal as written - confirm its type
+WHERE event_date = '2022-01-01'
+GROUP BY user_id
\ No newline at end of file
diff --git a/tables/active_users_cumulated.sql b/tables/active_users_cumulated.sql
new file mode 100644
index 0000000..386868b
--- /dev/null
+++ b/tables/active_users_cumulated.sql
@@ -0,0 +1,11 @@
+-- One row per user per snapshot_date; populated by
+-- queries/active_users_cumulated_populate.sql
+create table active_users_cumulated (
+    user_id integer,
+    is_daily_active integer,
+    is_weekly_active integer,
+    is_monthly_active integer,
+    -- one 0/1 flag per day, most recent day first, at most 30 entries
+    activity_array array(integer),
+    snapshot_date date
+)
diff --git a/tables/active_users_daily.sql b/tables/active_users_daily.sql
new file mode 100644
index 0000000..d70e45e
--- /dev/null
+++ b/tables/active_users_daily.sql
@@ -0,0 +1,8 @@
+-- One row per user per day; written by queries/active_users_daily_populate.sql
+-- and read by queries/active_users_cumulated_populate.sql.
+create table active_users_daily (
+    user_id integer,
+    -- 1 when the user generated at least one event on snapshot_date
+    is_active_today integer,
+    snapshot_date date
+)