-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanalysis.sql
28 lines (25 loc) · 1.3 KB
/
analysis.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
-- Data analysis utility methods to guide admins in correctly configuring Diffix.
-- Inspects all columns in a table for pre-anonymization filtering safety.
-- Warns when a column should be rejected from filtering in untrusted-mode.
CREATE OR REPLACE PROCEDURE analyze_filterability(table_name text)
AS $$
DECLARE
rows_count bigint;
column_name text;
top_occurrences bigint;
BEGIN
EXECUTE 'SELECT COUNT(*) FROM ' || table_name INTO STRICT rows_count;
RAISE INFO 'Analyzing filterability of table `%` with % rows...', table_name, rows_count;
FOR column_name IN EXECUTE 'SELECT attname FROM pg_attribute
WHERE attrelid = ' || table_name::regclass::oid || ' AND attnum > 0 AND NOT attisdropped'
LOOP
RAISE INFO 'Inspecting column `%`...', column_name;
EXECUTE 'SELECT COUNT(' || column_name || ') AS occurrences FROM ' || table_name ||
' GROUP BY ' || column_name || ' ORDER BY occurrences DESC LIMIT 1' INTO STRICT top_occurrences;
IF top_occurrences >= 0.65 * rows_count THEN -- warn if top value occurs in more than 65% of rows
RAISE WARNING 'Column `%` is dominated by one value and should be marked as `not_filterable`!', column_name;
END IF;
END LOOP;
RAISE INFO 'Filterability analysis is complete.';
END;
$$ LANGUAGE plpgsql;