diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index cf8c9453..00000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/.gitignore b/.gitignore
index a6fdd37d..46291105 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,10 @@ report/
sql/proprietary/
+# macOS specific
+.DS_Store
+*.dylib
+
# GitHub generated ignores
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/README.md b/README.md
index 359d7a7b..f0dbb175 100644
--- a/README.md
+++ b/README.md
@@ -164,7 +164,8 @@ substrings of `pg_hints` can be mentioned there:
-- accept: a b c
-- reject: NestLoop
-- max_timeout: 5s
--- tags: muted_nlj, 5s_max
+-- tags: muted_nlj, 5s_max, skip_consistency_check
+-- debug_hints: set (yb_enable_optimizer_statistics false)
select a.c1,
a.c2, ...
@@ -177,6 +178,12 @@ After optimizations are generated, the framework evaluates all of them with maxi
equal to current minimum execution time (starts with original optimization timeout) so do not spend
time on worst cases.
+#### Result validation
+
+TAQO runs result consistency checks, and their outcome is reflected in the collect and report logs.
+If different optimizations of a query return different results, the discrepancies are flagged.
+To disable this check for a query, add the `skip_consistency_check` tag.
+
----
## Report
@@ -251,7 +258,7 @@ session-props = [
skip-percentage-delta = 0.15
# query execution related options
-ddl-query-timeout = 3600 # skip DDL if they evaluated in more than 1200 seconds
+ddl-query-timeout = 3600 # skip DDL if they evaluated in more than 3600 seconds
test-query-timeout = 1200 # skip queries if they evaluated in more than 1200 seconds
# optimization generation
@@ -333,8 +340,6 @@ options:
--username USERNAME Username for connection
--password PASSWORD Password for user for connection
--database DATABASE Target database in postgres compatible database
- --enable-statistics, --no-enable-statistics
- Evaluate yb_enable_optimizer_statistics before running queries (default: False)
--explain-clause EXPLAIN_CLAUSE
Explain clause that will be placed before query. Default "EXPLAIN"
--num-queries NUM_QUERIES
@@ -355,19 +360,19 @@ See prepared scenarios in `bin/` directory
Collect queries results for basic model for localhost cluster
```
-src/runner.py
+python3 src/runner.py
collect
--optimizations
--model=basic
---config=config/qo.conf
---output=taqo_complex_yb
+--config=config/default.conf
+--output=taqo_basic_yb
--database=taqo
```
Generate comparison report for 2 previous collect runs
```
-src/runner.py
+python3 src/runner.py
report
--type=regression
--config=config/qo.conf
@@ -378,10 +383,10 @@ report
Generate score report which contains taqo analysis and comparison with postgres
```
-src/runner.py
+python3 src/runner.py
report
--type=score
--config=config/qo.conf
--results=report/basic_taqo_yb.json
--pg-results=report/basic_taqo_pg.json
-```
\ No newline at end of file
+```
diff --git a/css/adoc.css b/adoc/adoc.css
similarity index 93%
rename from css/adoc.css
rename to adoc/adoc.css
index a0143dd1..f99f051c 100644
--- a/css/adoc.css
+++ b/adoc/adoc.css
@@ -1,6 +1,14 @@
/* Based on Cosmo from Bootswatch (https://bootswatch.com/cosmo/) */
@import url("//fonts.googleapis.com/css?family=Source+Sans+Pro:300,400,700");
+/* table cell alignment attribute definitions from Asciidoctor default css */
+th.halign-left,td.halign-left{text-align:left}
+th.halign-right,td.halign-right{text-align:right}
+th.halign-center,td.halign-center{text-align:center}
+th.valign-top,td.valign-top{vertical-align:top}
+th.valign-bottom,td.valign-bottom{vertical-align:bottom}
+th.valign-middle,td.valign-middle{vertical-align:middle}
+
/* document body (contains all content) */
body {
font-family: "Source Sans Pro", Calibri, Candara, Arial, sans-serif;
@@ -356,4 +364,4 @@ video {
.black {color: #000000}
.red {color: #911717}
.blue {color: #3a3aac}
-.green {color:#080}
\ No newline at end of file
+.green {color:#080}
diff --git a/adoc/copy-to-clipboard-docinfo-processor.rb b/adoc/copy-to-clipboard-docinfo-processor.rb
new file mode 100644
index 00000000..65da3c44
--- /dev/null
+++ b/adoc/copy-to-clipboard-docinfo-processor.rb
@@ -0,0 +1,7 @@
+RUBY_ENGINE == 'opal' ? (require 'copy-to-clipboard-docinfo-processor/extension') : (require_relative 'copy-to-clipboard-docinfo-processor/extension')
+
+Asciidoctor::Extensions.register do
+ tree_processor CopyToClipboardTreeProcessor
+ docinfo_processor CopyToClipboardStylesDocinfoProcessor
+ docinfo_processor CopyToClipboardBehaviorDocinfoProcessor
+end
diff --git a/adoc/copy-to-clipboard-docinfo-processor/behavior.js b/adoc/copy-to-clipboard-docinfo-processor/behavior.js
new file mode 100644
index 00000000..fda7a003
--- /dev/null
+++ b/adoc/copy-to-clipboard-docinfo-processor/behavior.js
@@ -0,0 +1,83 @@
+/*! https://gitlab.com/antora/antora-ui-default/-/blob/master/src/js/06-copy-to-clipboard.js | License: MPL-2.0 */
+;(function () {
+ 'use strict'
+
+ var CMD_RX = /^\$ (\S[^\\\n]*(\\\n(?!\$ )[^\\\n]*)*)(?=\n|$)/gm
+ var LINE_CONTINUATION_RX = /( ) *\\\n *|\\\n( ?) */g
+ var TRAILING_SPACE_RX = / +$/gm
+ var LEADING_SPACE_NUMBER_RX = /^ ?\d+/gm;
+
+ var config = (document.getElementById('site-script') || { dataset: {} }).dataset
+ var supportsCopy = window.navigator.clipboard
+ var svgAs = config.svgAs
+ var uiRootPath = (config.uiRootPath == null ? window.uiRootPath : config.uiRootPath) || '.'
+
+ ;[].slice.call(document.querySelectorAll('.doc pre.highlight, .doc .literalblock pre')).forEach(function (pre) {
+ var code, language, lang, copy, toast, toolbox
+ if (pre.classList.contains('highlight')) {
+ code = pre.querySelector('code')
+ if ((language = code.dataset.lang) && language !== 'console') {
+ ;(lang = document.createElement('span')).className = 'source-lang'
+ lang.appendChild(document.createTextNode(language))
+ }
+ } else if (pre.innerText.startsWith('$ ')) {
+ var block = pre.parentNode.parentNode
+ block.classList.remove('literalblock')
+ block.classList.add('listingblock')
+ pre.classList.add('highlightjs', 'highlight')
+ ;(code = document.createElement('code')).className = 'language-console hljs'
+ code.dataset.lang = 'console'
+ code.appendChild(pre.firstChild)
+ pre.appendChild(code)
+ } else {
+ return
+ }
+ ;(toolbox = document.createElement('div')).className = 'source-toolbox'
+ if (lang) toolbox.appendChild(lang)
+ // TODO not the best solution?
+ // if (supportsCopy) {
+ ;(copy = document.createElement('button')).className = 'copy-button'
+ copy.setAttribute('title', 'Copy to clipboard')
+ if (svgAs === 'svg') {
+ var svg = document.createElementNS('http://www.w3.org/2000/svg', 'svg')
+ svg.setAttribute('class', 'copy-icon')
+ var use = document.createElementNS('http://www.w3.org/2000/svg', 'use')
+ use.setAttribute('href', uiRootPath + '/img/octicons-16.svg#icon-clippy')
+ svg.appendChild(use)
+ copy.appendChild(svg)
+ } else {
+ var img = document.createElement('img')
+ img.src = uiRootPath + '/img/octicons-16.svg#view-clippy'
+ img.alt = 'copy icon'
+ img.className = 'copy-icon'
+ copy.appendChild(img)
+ }
+ ;(toast = document.createElement('span')).className = 'copy-toast'
+ toast.appendChild(document.createTextNode('Copied!'))
+ copy.appendChild(toast)
+ toolbox.appendChild(copy)
+ // }
+ pre.parentNode.appendChild(toolbox)
+ if (copy) copy.addEventListener('click', writeToClipboard.bind(copy, code))
+ })
+
+ function extractCommands (text) {
+ var cmds = []
+ var m
+ while ((m = CMD_RX.exec(text))) cmds.push(m[1].replace(LINE_CONTINUATION_RX, '$1$2'))
+ return cmds.join(' && ')
+ }
+
+ function writeToClipboard (code) {
+ var text = code.innerText.replace(TRAILING_SPACE_RX, '').replace(/''/g, "'").replace(LEADING_SPACE_NUMBER_RX, '')
+ if (code.dataset.lang === 'console' && text.startsWith('$ ')) text = extractCommands(text)
+ window.navigator.clipboard.writeText(text).then(
+ function () {
+ this.classList.add('clicked')
+ this.offsetHeight // eslint-disable-line no-unused-expressions
+ this.classList.remove('clicked')
+ }.bind(this),
+ function () {}
+ )
+ }
+})()
diff --git a/adoc/copy-to-clipboard-docinfo-processor/extension.rb b/adoc/copy-to-clipboard-docinfo-processor/extension.rb
new file mode 100644
index 00000000..bac317aa
--- /dev/null
+++ b/adoc/copy-to-clipboard-docinfo-processor/extension.rb
@@ -0,0 +1,34 @@
+class CopyToClipboardTreeProcessor < Asciidoctor::Extensions::TreeProcessor
+ def process doc
+ doc.add_role 'doc' unless doc.has_role? 'doc'
+ nil
+ end
+end
+
+class CopyToClipboardStylesDocinfoProcessor < Asciidoctor::Extensions::DocinfoProcessor
+ use_dsl
+ at_location :head
+
+ def process doc
+ extdir = ::File.join ::File.dirname __FILE__
+ <<-EOS
+    <style>
+    #{::File.read (::File.join extdir, 'styles.css')}
+    </style>
+ EOS
+ end
+end
+
+class CopyToClipboardBehaviorDocinfoProcessor < Asciidoctor::Extensions::DocinfoProcessor
+ use_dsl
+ at_location :footer
+
+ def process doc
+ extdir = ::File.join ::File.dirname __FILE__
+ <<-EOS
+    <script>
+    #{::File.read (::File.join extdir, 'behavior.js')}
+    </script>
+ EOS
+ end
+end
diff --git a/adoc/copy-to-clipboard-docinfo-processor/img/octicons-16.svg b/adoc/copy-to-clipboard-docinfo-processor/img/octicons-16.svg
new file mode 100644
index 00000000..d8415d0a
--- /dev/null
+++ b/adoc/copy-to-clipboard-docinfo-processor/img/octicons-16.svg
@@ -0,0 +1,36 @@
+
diff --git a/adoc/copy-to-clipboard-docinfo-processor/sample.adoc b/adoc/copy-to-clipboard-docinfo-processor/sample.adoc
new file mode 100644
index 00000000..18b53350
--- /dev/null
+++ b/adoc/copy-to-clipboard-docinfo-processor/sample.adoc
@@ -0,0 +1,16 @@
+= Sample
+:source-highlighter: highlight.js
+
+[,ruby]
+----
+Asciidoctor::Extensions.register do
+ block do
+ named :sample
+ on_context :open
+
+ process do |parent, reader, attrs|
+ create_paragraph parent, reader.lines, attrs
+ end
+ end
+end
+----
diff --git a/adoc/copy-to-clipboard-docinfo-processor/styles.css b/adoc/copy-to-clipboard-docinfo-processor/styles.css
new file mode 100644
index 00000000..d10dcae7
--- /dev/null
+++ b/adoc/copy-to-clipboard-docinfo-processor/styles.css
@@ -0,0 +1,92 @@
+/*! derived from https://gitlab.com/antora/antora-ui-default/-/blob/master/src/css/doc.css | License: MPL-2.0 */
+.doc .listingblock > .content {
+ position: relative;
+}
+
+.doc .listingblock code[data-lang]::before {
+ content: none;
+}
+
+.doc .source-toolbox {
+ display: flex;
+ position: absolute;
+ visibility: hidden;
+ top: 0.25rem;
+ right: 0.5rem;
+ color: #808080;
+ white-space: nowrap;
+ font-size: 0.85em;
+}
+
+.doc .listingblock:hover .source-toolbox {
+ visibility: visible;
+}
+
+.doc .source-toolbox .source-lang {
+ font-family: "Droid Sans Mono", "DejaVu Sans Mono", monospace;
+ text-transform: uppercase;
+ letter-spacing: 0.075em;
+}
+
+.doc .source-toolbox > :not(:last-child)::after {
+ content: "|";
+ letter-spacing: 0;
+ padding: 0 1ch;
+}
+
+.doc .source-toolbox .copy-button {
+ cursor: pointer;
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ background: none;
+ border: none;
+ color: inherit;
+ outline: none;
+ padding: 0;
+ font-family: inherit;
+ font-size: inherit;
+ line-height: inherit;
+ width: 1em;
+ height: 1em;
+}
+
+.doc .source-toolbox .copy-icon {
+ flex: none;
+ width: inherit;
+ height: inherit;
+ filter: invert(50.2%);
+ margin-top: 0.05em;
+}
+
+.doc .source-toolbox .copy-toast {
+ flex: none;
+ position: relative;
+ display: inline-flex;
+ justify-content: center;
+ margin-top: 1em;
+ border-radius: 0.25em;
+ padding: 0.5em;
+ cursor: auto;
+ opacity: 0;
+ transition: opacity 0.5s ease 0.75s;
+ background: rgba(0, 0, 0, 0.8);
+ color: #fff;
+}
+
+.doc .source-toolbox .copy-toast::after {
+ content: "";
+ position: absolute;
+ top: 0;
+ width: 1em;
+ height: 1em;
+ border: 0.55em solid transparent;
+ border-left-color: rgba(0, 0, 0, 0.8);
+ transform: rotate(-90deg) translateX(50%) translateY(50%);
+ transform-origin: left;
+}
+
+.doc .source-toolbox .copy-button.clicked .copy-toast {
+ opacity: 1;
+ transition: none;
+}
diff --git a/bin/selectivity.sh b/bin/selectivity.sh
index c7fab3bc..ec0a7fea 100755
--- a/bin/selectivity.sh
+++ b/bin/selectivity.sh
@@ -35,10 +35,10 @@ python3 src/runner.py collect --model=$model --config=$config --output=ta_$model
echo "Evaluating table stats test against $rev with table analyze"
python3 src/runner.py collect --model=$model --config=$config --output=taa_$model$rev --ddls=none --explain-clause="explain analyze" --yes
-echo "Evaluating selectivity test against $rev"
-python3 src/runner.py collect --model=$model --config=$config --output=tsa_$model$rev --ddls=none --enable-statistics --explain-clause="explain" --yes
-echo "Evaluating selectivity test against $rev with table analyze"
-python3 src/runner.py collect --model=$model --config=$config --output=tsaa_$model$rev --ddls=none --explain-clause="explain analyze" --enable-statistics --yes
+echo "Evaluating CBO test against $rev"
+python3 src/runner.py collect --model=$model --config=$config --output=tsa_$model$rev --ddls=none --session-props="SET yb_enable_optimizer_statistics = true;" --explain-clause="explain" --yes
+echo "Evaluating CBO test against $rev with table analyze"
+python3 src/runner.py collect --model=$model --config=$config --output=tsaa_$model$rev --ddls=none --explain-clause="explain analyze" --session-props="SET yb_enable_optimizer_statistics = true;" --yes
echo "Generating report"
python3 src/runner.py report --type=selectivity --config=$config \
diff --git a/config/default.conf b/config/default.conf
index d6717f50..be1f5beb 100644
--- a/config/default.conf
+++ b/config/default.conf
@@ -3,7 +3,11 @@ source-path = "/yugabyte-db"
# optional if local code or archive test run
num-nodes = 3
-# default explain clause
+# query execution related options
+ddl-query-timeout = 7200 # skip DDL queries if they evaluated in more than 7200 seconds
+test-query-timeout = 1200 # skip queries if they evaluated in more than 1200 seconds
+compaction-timeout = 120 # todo timeout between compaction operations retries for YBDB
+
# will be used in TAQO, regression, comparison tests as a plan extraction command
explain-clause = "explain "
# session properties before executing set of testing queries
@@ -12,19 +16,17 @@ session-props = [
"SET pg_hint_plan.debug_print = ON;",
"SET client_min_messages TO log;",
"SET pg_hint_plan.message_level = debug;",
+ "SET temp_file_limit=\"8182MB\";",
]
# allowed diff between queries (all tests included)
-skip-percentage-delta = 0.15
-
-# query execution related options
-ddl-query-timeout = 3600 # skip DDL queries if they evaluated in more than 1200 seconds
-test-query-timeout = 1200 # skip queries if they evaluated in more than 1200 seconds
+skip-percentage-delta = 0.05
-# optimization generation
+# TAQO related options
skip-timeout-delta = 1 # skip queries if they exceed (min+1) seconds
all-pairs-threshold = 3 # maximum number of tables after which all_pairs will be used, -1 to use all combinations always
look-near-best-plan = true # evaluate only queries that are near current best optimization
+all-index-check = true # check all indexes explicitly in scans otherwise use table_name
# limit number of queries in model, needed for debug
num-queries = -1
diff --git a/requirements.txt b/requirements.txt
index 4934afb4..40c813d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,10 @@ dacite~=1.8.0
sqlparse~=0.4.4
sql_metadata
sql_formatter
+pglast
allpairspy~=2.5.0
xlsxwriter
-numpy~=1.24.2
\ No newline at end of file
+numpy~=1.24.2
+scipy
+bokeh~=3.6.2
+requests
\ No newline at end of file
diff --git a/scripts/generate_basic_data.py b/scripts/generate_basic_data.py
index 5055f7ba..9c943bf0 100644
--- a/scripts/generate_basic_data.py
+++ b/scripts/generate_basic_data.py
@@ -2,7 +2,7 @@
import os
import string
from os.path import exists
-from random import random, choices
+from random import choices, seed
from tqdm import tqdm
@@ -10,7 +10,7 @@
def generate_data(multiplier):
print("Generating data files for simplified model")
- random.seed = 2023
+ seed(2023)
# create dir if not there yet
if not exists(f"{os.path.abspath(os.getcwd())}/sql/basic/data"):
diff --git a/sql/basic/create.sql b/sql/basic/create.sql
index 2c1948f9..fbcf058c 100644
--- a/sql/basic/create.sql
+++ b/sql/basic/create.sql
@@ -1,32 +1,47 @@
-CREATE TABLE t1(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 ASC, k2 ASC)) WITH (colocation = true);
+CREATE TABLE t1
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1 ASC, k2 ASC)
+);
CREATE INDEX ON t1(v1 ASC, k2 ASC);
-CREATE TABLE t2(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 ASC, k2 ASC)) WITH (colocation = true);
+CREATE TABLE t2
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1 ASC, k2 ASC)
+);
CREATE INDEX ON t2(v1 ASC, k2 ASC);
-CREATE TABLE t3(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 ASC, k2 ASC)) WITH (colocation = true);
+CREATE TABLE t3
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1 ASC, k2 ASC)
+);
CREATE INDEX ON t3(v1 ASC, k2 ASC);
-CREATE TABLE ts2(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 DESC, k2 DESC)) WITH (colocation = true);
+CREATE TABLE ts2
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1 ASC, k2 ASC)
+);
-CREATE TABLE ts3(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 DESC)) WITH (colocation = true);
+CREATE TABLE ts3
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1 ASC)
+);
diff --git a/sql/basic/hash.create.sql b/sql/basic/hash.create.sql
new file mode 100644
index 00000000..898f1a92
--- /dev/null
+++ b/sql/basic/hash.create.sql
@@ -0,0 +1,47 @@
+CREATE TABLE t1
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY ((k1, k2) HASH)
+);
+CREATE INDEX ON t1((v1, k2) HASH);
+
+CREATE TABLE t2
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY ((k1, k2) HASH)
+);
+CREATE INDEX ON t2((k1, k2) HASH);
+
+CREATE TABLE t3
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY ((k1, k2) HASH)
+);
+CREATE INDEX ON t3((k1, k2) HASH);
+
+CREATE TABLE ts2
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY ((k1, k2) HASH)
+);
+
+CREATE TABLE ts3
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1 HASH)
+);
diff --git a/sql/basic/obsolete.create.sql b/sql/basic/obsolete.create.sql
deleted file mode 100644
index e1607c10..00000000
--- a/sql/basic/obsolete.create.sql
+++ /dev/null
@@ -1,32 +0,0 @@
-CREATE TABLE t1(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 ASC, k2 ASC)) WITH (colocated = true);
-CREATE INDEX ON t1(v1 ASC, k2 ASC);
-
-CREATE TABLE t2(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 ASC, k2 ASC)) WITH (colocated = true);
-CREATE INDEX ON t2(v1 ASC, k2 ASC);
-
-CREATE TABLE t3(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 ASC, k2 ASC)) WITH (colocated = true);
-CREATE INDEX ON t3(v1 ASC, k2 ASC);
-
-CREATE TABLE ts2(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 DESC, k2 DESC)) WITH (colocated = true);
-
-CREATE TABLE ts3(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1 DESC)) WITH (colocated = true);
diff --git a/sql/basic/postgres.create.sql b/sql/basic/postgres.create.sql
index 68f44096..97a6b99d 100644
--- a/sql/basic/postgres.create.sql
+++ b/sql/basic/postgres.create.sql
@@ -1,32 +1,47 @@
-CREATE TABLE t1(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1, k2));
-CREATE INDEX ON t1(v1, k2);
+CREATE TABLE t1
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1, k2)
+);
+CREATE INDEX ON t1 (v1, k2);
-CREATE TABLE t2(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1, k2));
-CREATE INDEX ON t2(v1, k2);
+CREATE TABLE t2
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1, k2)
+);
+CREATE INDEX ON t2 (v1, k2);
-CREATE TABLE t3(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1, k2));
-CREATE INDEX ON t3(v1, k2);
+CREATE TABLE t3
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1, k2)
+);
+CREATE INDEX ON t3 (v1, k2);
-CREATE TABLE ts2(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1, k2));
+CREATE TABLE ts2
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1, k2)
+);
-CREATE TABLE ts3(k1 int,
- k2 text,
- v1 int,
- v2 text,
- PRIMARY KEY(k1));
+CREATE TABLE ts3
+(
+ k1 int,
+ k2 text,
+ v1 int,
+ v2 text,
+ PRIMARY KEY (k1)
+);
diff --git a/sql/basic/stats.sql b/sql/basic/stats.sql
deleted file mode 100644
index 84e979fe..00000000
--- a/sql/basic/stats.sql
+++ /dev/null
@@ -1,5 +0,0 @@
-SELECT (select max(k1) from t1),
- (select max(k1) from t2),
- (select max(k1) from t3),
- (select max(k1) from ts2),
- (select max(k1) from ts3);
\ No newline at end of file
diff --git a/sql/complex/create.sql b/sql/complex/create.sql
index 286f32dc..20c6b809 100644
--- a/sql/complex/create.sql
+++ b/sql/complex/create.sql
@@ -1,5 +1,5 @@
CREATE TABLE t1000000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
@@ -8,34 +8,34 @@ SELECT c_int,
(c_int + 0.2):: float as c_float,
(c_int + 0.3):: real as c_real,
(c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
-CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
-CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
-CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
-CREATE INDEX t1000000_4_idx ON t1000000 (c_int, c_varchar);
-CREATE INDEX t1000000_5_idx ON t1000000 (c_float, c_text, c_varchar);
-CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
+CREATE INDEX t1000000_1_idx ON t1000000 (c_int ASC);
+CREATE INDEX t1000000_2_idx ON t1000000 (c_int ASC, c_bool ASC);
+CREATE INDEX t1000000_3_idx ON t1000000 (c_int ASC, c_text ASC);
+CREATE INDEX t1000000_4_idx ON t1000000 (c_int ASC, c_varchar ASC);
+CREATE INDEX t1000000_5_idx ON t1000000 (c_float ASC, c_text ASC, c_varchar ASC);
+CREATE INDEX t1000000_6_idx ON t1000000 (c_float ASC, c_decimal ASC, c_varchar ASC);
+CREATE INDEX t1000000_7_idx ON t1000000 (c_float ASC, c_real ASC, c_money ASC);
CREATE TABLE t500000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 50000 * $MULTIPLIER) c_int;
-CREATE INDEX t500000_1_idx ON t500000 (c_int);
-CREATE INDEX t500000_2_idx ON t500000 (c_int, c_bool);
-CREATE INDEX t500000_3_idx ON t500000 (c_int, c_text);
-CREATE INDEX t500000_4_idx ON t500000 (c_int, c_varchar);
-CREATE INDEX t500000_5_idx ON t500000 (c_float, c_text, c_varchar);
-CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 50000 * $MULTIPLIER) c_int;
+CREATE INDEX t500000_1_idx ON t500000 (c_int ASC);
+CREATE INDEX t500000_2_idx ON t500000 (c_int ASC, c_bool ASC);
+CREATE INDEX t500000_3_idx ON t500000 (c_int ASC, c_text ASC);
+CREATE INDEX t500000_4_idx ON t500000 (c_int ASC, c_varchar ASC);
+CREATE INDEX t500000_5_idx ON t500000 (c_float ASC, c_text, c_varchar ASC);
+CREATE INDEX t500000_6_idx ON t500000 (c_float ASC, c_decimal, c_varchar ASC);
+CREATE INDEX t500000_7_idx ON t500000 (c_float ASC, c_real ASC, c_money ASC);
CREATE TABLE t50000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
@@ -44,16 +44,16 @@ SELECT c_int,
(c_int + 0.2):: float as c_float,
(c_int + 0.3):: real as c_real,
(c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
-CREATE INDEX t50000_1_idx ON t50000 (c_int);
-CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
-CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
-CREATE INDEX t50000_4_idx ON t50000 (c_int, c_varchar);
-CREATE INDEX t50000_5_idx ON t50000 (c_float, c_text, c_varchar);
-CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
+CREATE INDEX t50000_1_idx ON t50000 (c_int ASC);
+CREATE INDEX t50000_2_idx ON t50000 (c_int ASC, c_bool ASC);
+CREATE INDEX t50000_3_idx ON t50000 (c_int ASC, c_text ASC);
+CREATE INDEX t50000_4_idx ON t50000 (c_int ASC, c_varchar ASC);
+CREATE INDEX t50000_5_idx ON t50000 (c_float ASC, c_text ASC, c_varchar ASC);
+CREATE INDEX t50000_6_idx ON t50000 (c_float ASC, c_decimal ASC, c_varchar ASC);
+CREATE INDEX t50000_7_idx ON t50000 (c_float ASC, c_real ASC, c_money ASC);
CREATE TABLE t100
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
@@ -62,10 +62,10 @@ SELECT c_int,
(c_int + 0.2):: float as c_float,
(c_int + 0.3):: real as c_real,
(c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
-CREATE INDEX t100_1_idx ON t100 (c_int);
-CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
-CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
-CREATE INDEX t100_4_idx ON t100 (c_int, c_varchar);
-CREATE INDEX t100_5_idx ON t100 (c_float, c_text, c_varchar);
-CREATE INDEX t100_6_idx ON t100 (c_float, c_decimal, c_varchar);
-CREATE INDEX t100_7_idx ON t100 (c_float, c_real, c_money);
\ No newline at end of file
+CREATE INDEX t100_1_idx ON t100 (c_int ASC);
+CREATE INDEX t100_2_idx ON t100 (c_int ASC, c_bool ASC);
+CREATE INDEX t100_3_idx ON t100 (c_int ASC, c_text ASC);
+CREATE INDEX t100_4_idx ON t100 (c_int ASC, c_varchar ASC);
+CREATE INDEX t100_5_idx ON t100 (c_float ASC, c_text ASC, c_varchar ASC);
+CREATE INDEX t100_6_idx ON t100 (c_float ASC, c_decimal ASC, c_varchar ASC);
+CREATE INDEX t100_7_idx ON t100 (c_float ASC, c_real ASC, c_money ASC);
\ No newline at end of file
diff --git a/sql/complex/hash.create.sql b/sql/complex/hash.create.sql
new file mode 100644
index 00000000..b398fa55
--- /dev/null
+++ b/sql/complex/hash.create.sql
@@ -0,0 +1,71 @@
+CREATE TABLE t1000000
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
+CREATE INDEX t1000000_1_idx ON t1000000 (c_int HASH);
+CREATE INDEX t1000000_2_idx ON t1000000 ((c_int, c_bool) HASH);
+CREATE INDEX t1000000_3_idx ON t1000000 ((c_int, c_text) HASH);
+CREATE INDEX t1000000_4_idx ON t1000000 ((c_int, c_varchar) HASH);
+CREATE INDEX t1000000_5_idx ON t1000000 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t1000000_6_idx ON t1000000 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t1000000_7_idx ON t1000000 ((c_float, c_real, c_money) HASH);
+
+CREATE TABLE t500000
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 50000 * $MULTIPLIER) c_int;
+CREATE INDEX t500000_1_idx ON t500000 (c_int HASH);
+CREATE INDEX t500000_2_idx ON t500000 ((c_int, c_bool) HASH);
+CREATE INDEX t500000_3_idx ON t500000 ((c_int, c_text) HASH);
+CREATE INDEX t500000_4_idx ON t500000 ((c_int, c_varchar) HASH);
+CREATE INDEX t500000_5_idx ON t500000 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t500000_6_idx ON t500000 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t500000_7_idx ON t500000 ((c_float, c_real, c_money) HASH);
+
+CREATE TABLE t50000
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
+CREATE INDEX t50000_1_idx ON t50000 (c_int HASH);
+CREATE INDEX t50000_2_idx ON t50000 ((c_int, c_bool) HASH);
+CREATE INDEX t50000_3_idx ON t50000 ((c_int, c_text) HASH);
+CREATE INDEX t50000_4_idx ON t50000 ((c_int, c_varchar) HASH);
+CREATE INDEX t50000_5_idx ON t50000 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t50000_6_idx ON t50000 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t50000_7_idx ON t50000 ((c_float, c_real, c_money) HASH);
+
+CREATE TABLE t100
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
+CREATE INDEX t100_1_idx ON t100 (c_int HASH);
+CREATE INDEX t100_2_idx ON t100 ((c_int, c_bool) HASH);
+CREATE INDEX t100_3_idx ON t100 ((c_int, c_text) HASH);
+CREATE INDEX t100_4_idx ON t100 ((c_int, c_varchar) HASH);
+CREATE INDEX t100_5_idx ON t100 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t100_6_idx ON t100 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t100_7_idx ON t100 ((c_float, c_real, c_money) HASH);
\ No newline at end of file
diff --git a/sql/complex/model.conf b/sql/complex/model.conf
new file mode 100644
index 00000000..2eb29eb8
--- /dev/null
+++ b/sql/complex/model.conf
@@ -0,0 +1,2 @@
+all-index-check = false
+load-catalog-tables = false
\ No newline at end of file
diff --git a/sql/complex/obsolete.create.sql b/sql/complex/obsolete.create.sql
deleted file mode 100644
index 5bea7f90..00000000
--- a/sql/complex/obsolete.create.sql
+++ /dev/null
@@ -1,71 +0,0 @@
-CREATE TABLE t1000000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
-CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
-CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
-CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
-CREATE INDEX t1000000_4_idx ON t1000000 (c_int, c_varchar);
-CREATE INDEX t1000000_5_idx ON t1000000 (c_float, c_text, c_varchar);
-CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
-
-CREATE TABLE t500000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 50000 * $MULTIPLIER) c_int;
-CREATE INDEX t500000_1_idx ON t500000 (c_int);
-CREATE INDEX t500000_2_idx ON t500000 (c_int, c_bool);
-CREATE INDEX t500000_3_idx ON t500000 (c_int, c_text);
-CREATE INDEX t500000_4_idx ON t500000 (c_int, c_varchar);
-CREATE INDEX t500000_5_idx ON t500000 (c_float, c_text, c_varchar);
-CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
-
-CREATE TABLE t50000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
-CREATE INDEX t50000_1_idx ON t50000 (c_int);
-CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
-CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
-CREATE INDEX t50000_4_idx ON t50000 (c_int, c_varchar);
-CREATE INDEX t50000_5_idx ON t50000 (c_float, c_text, c_varchar);
-CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
-
-CREATE TABLE t100
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
-CREATE INDEX t100_1_idx ON t100 (c_int);
-CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
-CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
-CREATE INDEX t100_4_idx ON t100 (c_int, c_varchar);
-CREATE INDEX t100_5_idx ON t100 (c_float, c_text, c_varchar);
-CREATE INDEX t100_6_idx ON t100 (c_float, c_decimal, c_varchar);
-CREATE INDEX t100_7_idx ON t100 (c_float, c_real, c_money);
\ No newline at end of file
diff --git a/sql/complex/postgres.create.sql b/sql/complex/postgres.create.sql
index ef0a3b7e..eb4d4f15 100644
--- a/sql/complex/postgres.create.sql
+++ b/sql/complex/postgres.create.sql
@@ -1,13 +1,13 @@
CREATE TABLE t1000000
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
@@ -17,7 +17,7 @@ CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
CREATE TABLE t500000
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
@@ -35,15 +35,15 @@ CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
CREATE TABLE t50000
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
CREATE INDEX t50000_1_idx ON t50000 (c_int);
CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
@@ -53,15 +53,15 @@ CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
CREATE TABLE t100
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
CREATE INDEX t100_1_idx ON t100 (c_int);
CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
diff --git a/sql/complex/stats.sql b/sql/complex/stats.sql
deleted file mode 100644
index 53a0dc0f..00000000
--- a/sql/complex/stats.sql
+++ /dev/null
@@ -1,4 +0,0 @@
-select (select count(*) from t1000000),
- (select count(*) from t500000),
- (select count(*) from t50000),
- (select count(*) from t100);
\ No newline at end of file
diff --git a/sql/dml/create.sql b/sql/dml/create.sql
index 694eae5b..57c659c9 100644
--- a/sql/dml/create.sql
+++ b/sql/dml/create.sql
@@ -1,13 +1,13 @@
CREATE TABLE t1000000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 1000000) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 1000000) c_int;
CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
@@ -17,7 +17,7 @@ CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
CREATE TABLE t500000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
@@ -35,15 +35,15 @@ CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
CREATE TABLE t50000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 50000) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 50000) c_int;
CREATE INDEX t50000_1_idx ON t50000 (c_int);
CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
@@ -53,15 +53,15 @@ CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
CREATE TABLE t100
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 100) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 100) c_int;
CREATE INDEX t100_1_idx ON t100 (c_int);
CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
diff --git a/sql/dml/model.conf b/sql/dml/model.conf
new file mode 100644
index 00000000..1a761d3f
--- /dev/null
+++ b/sql/dml/model.conf
@@ -0,0 +1 @@
+load-catalog-tables = false
\ No newline at end of file
diff --git a/sql/dml/obsolete.create.sql b/sql/dml/obsolete.create.sql
deleted file mode 100644
index 00c56cb2..00000000
--- a/sql/dml/obsolete.create.sql
+++ /dev/null
@@ -1,71 +0,0 @@
-CREATE TABLE t1000000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 1000000) c_int;
-CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
-CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
-CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
-CREATE INDEX t1000000_4_idx ON t1000000 (c_int, c_varchar);
-CREATE INDEX t1000000_5_idx ON t1000000 (c_float, c_text, c_varchar);
-CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
-
-CREATE TABLE t500000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 500000) c_int;
-CREATE INDEX t500000_1_idx ON t500000 (c_int);
-CREATE INDEX t500000_2_idx ON t500000 (c_int, c_bool);
-CREATE INDEX t500000_3_idx ON t500000 (c_int, c_text);
-CREATE INDEX t500000_4_idx ON t500000 (c_int, c_varchar);
-CREATE INDEX t500000_5_idx ON t500000 (c_float, c_text, c_varchar);
-CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
-
-CREATE TABLE t50000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 50000) c_int;
-CREATE INDEX t50000_1_idx ON t50000 (c_int);
-CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
-CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
-CREATE INDEX t50000_4_idx ON t50000 (c_int, c_varchar);
-CREATE INDEX t50000_5_idx ON t50000 (c_float, c_text, c_varchar);
-CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
-
-CREATE TABLE t100
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 100) c_int;
-CREATE INDEX t100_1_idx ON t100 (c_int);
-CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
-CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
-CREATE INDEX t100_4_idx ON t100 (c_int, c_varchar);
-CREATE INDEX t100_5_idx ON t100 (c_float, c_text, c_varchar);
-CREATE INDEX t100_6_idx ON t100 (c_float, c_decimal, c_varchar);
-CREATE INDEX t100_7_idx ON t100 (c_float, c_real, c_money);
\ No newline at end of file
diff --git a/sql/dml/postgres.create.sql b/sql/dml/postgres.create.sql
deleted file mode 100644
index e8b8af1d..00000000
--- a/sql/dml/postgres.create.sql
+++ /dev/null
@@ -1,71 +0,0 @@
-CREATE TABLE t1000000
- AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 1000000) c_int;
-CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
-CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
-CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
-CREATE INDEX t1000000_4_idx ON t1000000 (c_int, c_varchar);
-CREATE INDEX t1000000_5_idx ON t1000000 (c_float, c_text, c_varchar);
-CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
-
-CREATE TABLE t500000
- AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 500000) c_int;
-CREATE INDEX t500000_1_idx ON t500000 (c_int);
-CREATE INDEX t500000_2_idx ON t500000 (c_int, c_bool);
-CREATE INDEX t500000_3_idx ON t500000 (c_int, c_text);
-CREATE INDEX t500000_4_idx ON t500000 (c_int, c_varchar);
-CREATE INDEX t500000_5_idx ON t500000 (c_float, c_text, c_varchar);
-CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
-
-CREATE TABLE t50000
- AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 50000) c_int;
-CREATE INDEX t50000_1_idx ON t50000 (c_int);
-CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
-CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
-CREATE INDEX t50000_4_idx ON t50000 (c_int, c_varchar);
-CREATE INDEX t50000_5_idx ON t50000 (c_float, c_text, c_varchar);
-CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
-
-CREATE TABLE t100
- AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 100) c_int;
-CREATE INDEX t100_1_idx ON t100 (c_int);
-CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
-CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
-CREATE INDEX t100_4_idx ON t100 (c_int, c_varchar);
-CREATE INDEX t100_5_idx ON t100 (c_float, c_text, c_varchar);
-CREATE INDEX t100_6_idx ON t100 (c_float, c_decimal, c_varchar);
-CREATE INDEX t100_7_idx ON t100 (c_float, c_real, c_money);
\ No newline at end of file
diff --git a/sql/dml/stats.sql b/sql/dml/stats.sql
deleted file mode 100644
index 53a0dc0f..00000000
--- a/sql/dml/stats.sql
+++ /dev/null
@@ -1,4 +0,0 @@
-select (select count(*) from t1000000),
- (select count(*) from t500000),
- (select count(*) from t50000),
- (select count(*) from t100);
\ No newline at end of file
diff --git a/sql/join-order-benchmark/create.sql b/sql/join-order-benchmark/create.sql
index a3f844f5..e0935b91 100644
--- a/sql/join-order-benchmark/create.sql
+++ b/sql/join-order-benchmark/create.sql
@@ -1,194 +1,215 @@
CREATE TABLE aka_name (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- name text NOT NULL,
- imdb_index character varying(12),
- name_pcode_cf character varying(5),
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
-) WITH (colocation = true);
+ id integer NOT NULL,
+ person_id integer NOT NULL,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ name_pcode_cf character varying(5),
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE aka_title (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- title text NOT NULL,
- imdb_index character varying(12),
- kind_id integer NOT NULL,
- production_year integer,
- phonetic_code character varying(5),
- episode_of_id integer,
- season_nr integer,
- episode_nr integer,
- note text,
- md5sum character varying(32)
-) WITH (colocation = true);
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ title text NOT NULL,
+ imdb_index character varying(12),
+ kind_id integer NOT NULL,
+ production_year integer,
+ phonetic_code character varying(5),
+ episode_of_id integer,
+ season_nr integer,
+ episode_nr integer,
+ note text,
+ md5sum character varying(32),
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE cast_info (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- movie_id integer NOT NULL,
- person_role_id integer,
- note text,
- nr_order integer,
- role_id integer NOT NULL
-) WITH (colocation = true);
+ id integer NOT NULL,
+ person_id integer NOT NULL,
+ movie_id integer NOT NULL,
+ person_role_id integer,
+ note text,
+ nr_order integer,
+ role_id integer NOT NULL,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE char_name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- imdb_index character varying(12),
- imdb_id integer,
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
-) WITH (colocation = true);
+ id integer NOT NULL,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ imdb_id integer,
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE comp_cast_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(32) NOT NULL
-) WITH (colocation = true);
+ id integer NOT NULL,
+ kind character varying(32) NOT NULL,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE company_name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- country_code character varying(255),
- imdb_id integer,
- name_pcode_nf character varying(5),
- name_pcode_sf character varying(5),
- md5sum character varying(32)
-) WITH (colocation = true);
+ id integer NOT NULL,
+ name text NOT NULL,
+ country_code character varying(255),
+ imdb_id integer,
+ name_pcode_nf character varying(5),
+ name_pcode_sf character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE company_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(32) NOT NULL
+ id integer NOT NULL,
+ kind character varying(32) NOT NULL,
+ PRIMARY KEY (id ASC)
);
CREATE TABLE complete_cast (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer,
- subject_id integer NOT NULL,
- status_id integer NOT NULL
-) WITH (colocation = true);
+ id integer NOT NULL,
+ movie_id integer,
+ subject_id integer NOT NULL,
+ status_id integer NOT NULL,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE info_type (
- id integer NOT NULL PRIMARY KEY,
- info character varying(32) NOT NULL
-) WITH (colocation = true);
+ id integer NOT NULL,
+ info character varying(32) NOT NULL,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE keyword (
- id integer NOT NULL PRIMARY KEY,
- keyword text NOT NULL,
- phonetic_code character varying(5)
-) WITH (colocation = true);
+ id integer NOT NULL,
+ keyword text NOT NULL,
+ phonetic_code character varying(5),
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE kind_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(15) NOT NULL
-) WITH (colocation = true);
+ id integer NOT NULL,
+ kind character varying(15) NOT NULL,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE link_type (
- id integer NOT NULL PRIMARY KEY,
- link character varying(32) NOT NULL
+ id integer NOT NULL,
+ link character varying(32) NOT NULL,
+ PRIMARY KEY (id ASC)
);
CREATE TABLE movie_companies (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- company_id integer NOT NULL,
- company_type_id integer NOT NULL,
- note text
-) WITH (colocation = true);
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ company_id integer NOT NULL,
+ company_type_id integer NOT NULL,
+ note text,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE movie_info (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
-) WITH (colocation = true);
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE movie_info_idx (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
-) WITH (colocation = true);
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE movie_keyword (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- keyword_id integer NOT NULL
-) WITH (colocation = true);
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ keyword_id integer NOT NULL,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE movie_link (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- linked_movie_id integer NOT NULL,
- link_type_id integer NOT NULL
-) WITH (colocation = true);
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ linked_movie_id integer NOT NULL,
+ link_type_id integer NOT NULL,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- imdb_index character varying(12),
- imdb_id integer,
- gender character varying(1),
- name_pcode_cf character varying(5),
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
-) WITH (colocation = true);
+ id integer NOT NULL,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ imdb_id integer,
+ gender character varying(1),
+ name_pcode_cf character varying(5),
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE person_info (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
-) WITH (colocation = true);
+ id integer NOT NULL,
+ person_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text,
+ PRIMARY KEY (id ASC)
+);
CREATE TABLE role_type (
- id integer NOT NULL PRIMARY KEY,
- role character varying(32) NOT NULL
+ id integer NOT NULL,
+ role character varying(32) NOT NULL,
+ PRIMARY KEY (id ASC)
);
CREATE TABLE title (
- id integer NOT NULL PRIMARY KEY,
- title text NOT NULL,
- imdb_index character varying(12),
- kind_id integer NOT NULL,
- production_year integer,
- imdb_id integer,
- phonetic_code character varying(5),
- episode_of_id integer,
- season_nr integer,
- episode_nr integer,
- series_years character varying(49),
- md5sum character varying(32)
-) WITH (colocation = true);
-
-create index company_id_movie_companies on movie_companies(company_id);
-create index company_type_id_movie_companies on movie_companies(company_type_id);
-create index info_type_id_movie_info_idx on movie_info_idx(info_type_id);
-create index info_type_id_movie_info on movie_info(info_type_id);
-create index info_type_id_person_info on person_info(info_type_id);
-create index keyword_id_movie_keyword on movie_keyword(keyword_id);
-create index kind_id_aka_title on aka_title(kind_id);
-create index kind_id_title on title(kind_id);
-create index linked_movie_id_movie_link on movie_link(linked_movie_id);
-create index link_type_id_movie_link on movie_link(link_type_id);
-create index movie_id_aka_title on aka_title(movie_id);
-create index movie_id_cast_info on cast_info(movie_id);
-create index movie_id_complete_cast on complete_cast(movie_id);
-create index movie_id_movie_companies on movie_companies(movie_id);
-create index movie_id_movie_info_idx on movie_info_idx(movie_id);
-create index movie_id_movie_keyword on movie_keyword(movie_id);
-create index movie_id_movie_link on movie_link(movie_id);
-create index movie_id_movie_info on movie_info(movie_id);
-create index person_id_aka_name on aka_name(person_id);
-create index person_id_cast_info on cast_info(person_id);
-create index person_id_person_info on person_info(person_id);
-create index person_role_id_cast_info on cast_info(person_role_id);
-create index role_id_cast_info on cast_info(role_id);
+ id integer NOT NULL,
+ title text NOT NULL,
+ imdb_index character varying(12),
+ kind_id integer NOT NULL,
+ production_year integer,
+ imdb_id integer,
+ phonetic_code character varying(5),
+ episode_of_id integer,
+ season_nr integer,
+ episode_nr integer,
+ series_years character varying(49),
+ md5sum character varying(32),
+ PRIMARY KEY (id ASC)
+);
+
+create index company_id_movie_companies on movie_companies(company_id ASC);
+create index company_type_id_movie_companies on movie_companies(company_type_id ASC);
+create index info_type_id_movie_info_idx on movie_info_idx(info_type_id ASC);
+create index info_type_id_movie_info on movie_info(info_type_id ASC);
+create index info_type_id_person_info on person_info(info_type_id ASC);
+create index keyword_id_movie_keyword on movie_keyword(keyword_id ASC);
+create index kind_id_aka_title on aka_title(kind_id ASC);
+create index kind_id_title on title(kind_id ASC);
+create index linked_movie_id_movie_link on movie_link(linked_movie_id ASC);
+create index link_type_id_movie_link on movie_link(link_type_id ASC);
+create index movie_id_aka_title on aka_title(movie_id ASC);
+create index movie_id_cast_info on cast_info(movie_id ASC);
+create index movie_id_complete_cast on complete_cast(movie_id ASC);
+create index movie_id_movie_companies on movie_companies(movie_id ASC);
+create index movie_id_movie_info_idx on movie_info_idx(movie_id ASC);
+create index movie_id_movie_keyword on movie_keyword(movie_id ASC);
+create index movie_id_movie_link on movie_link(movie_id ASC);
+create index movie_id_movie_info on movie_info(movie_id ASC);
+create index person_id_aka_name on aka_name(person_id ASC);
+create index person_id_cast_info on cast_info(person_id ASC);
+create index person_id_person_info on person_info(person_id ASC);
+create index person_role_id_cast_info on cast_info(person_role_id ASC);
+create index role_id_cast_info on cast_info(role_id ASC);
diff --git a/sql/join-order-benchmark/hash.create.sql b/sql/join-order-benchmark/hash.create.sql
new file mode 100644
index 00000000..560aa75b
--- /dev/null
+++ b/sql/join-order-benchmark/hash.create.sql
@@ -0,0 +1,215 @@
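+-- join-order-benchmark schema with HASH primary keys and HASH secondary indexes
+-- (HASH counterpart of the ASC-ordered create script).
+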
+CREATE TABLE aka_name (
+ id integer NOT NULL,
+ person_id integer NOT NULL,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ name_pcode_cf character varying(5),
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE aka_title (
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ title text NOT NULL,
+ imdb_index character varying(12),
+ kind_id integer NOT NULL,
+ production_year integer,
+ phonetic_code character varying(5),
+ episode_of_id integer,
+ season_nr integer,
+ episode_nr integer,
+ note text,
+ md5sum character varying(32),
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE cast_info (
+ id integer NOT NULL,
+ person_id integer NOT NULL,
+ movie_id integer NOT NULL,
+ person_role_id integer,
+ note text,
+ nr_order integer,
+ role_id integer NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE char_name (
+ id integer NOT NULL,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ imdb_id integer,
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE comp_cast_type (
+ id integer NOT NULL,
+ kind character varying(32) NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE company_name (
+ id integer NOT NULL,
+ name text NOT NULL,
+ country_code character varying(255),
+ imdb_id integer,
+ name_pcode_nf character varying(5),
+ name_pcode_sf character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE company_type (
+ id integer NOT NULL,
+ kind character varying(32) NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE complete_cast (
+ id integer NOT NULL,
+ movie_id integer,
+ subject_id integer NOT NULL,
+ status_id integer NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE info_type (
+ id integer NOT NULL,
+ info character varying(32) NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE keyword (
+ id integer NOT NULL,
+ keyword text NOT NULL,
+ phonetic_code character varying(5),
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE kind_type (
+ id integer NOT NULL,
+ kind character varying(15) NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE link_type (
+ id integer NOT NULL,
+ link character varying(32) NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE movie_companies (
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ company_id integer NOT NULL,
+ company_type_id integer NOT NULL,
+ note text,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE movie_info (
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE movie_info_idx (
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE movie_keyword (
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ keyword_id integer NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE movie_link (
+ id integer NOT NULL,
+ movie_id integer NOT NULL,
+ linked_movie_id integer NOT NULL,
+ link_type_id integer NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE name (
+ id integer NOT NULL,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ imdb_id integer,
+ gender character varying(1),
+ name_pcode_cf character varying(5),
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32),
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE person_info (
+ id integer NOT NULL,
+ person_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE role_type (
+ id integer NOT NULL,
+ role character varying(32) NOT NULL,
+ PRIMARY KEY (id HASH)
+);
+
+CREATE TABLE title (
+ id integer NOT NULL,
+ title text NOT NULL,
+ imdb_index character varying(12),
+ kind_id integer NOT NULL,
+ production_year integer,
+ imdb_id integer,
+ phonetic_code character varying(5),
+ episode_of_id integer,
+ season_nr integer,
+ episode_nr integer,
+ series_years character varying(49),
+ md5sum character varying(32),
+ PRIMARY KEY (id HASH)
+);
+
+create index company_id_movie_companies on movie_companies(company_id HASH);
+create index company_type_id_movie_companies on movie_companies(company_type_id HASH);
+create index info_type_id_movie_info_idx on movie_info_idx(info_type_id HASH);
+create index info_type_id_movie_info on movie_info(info_type_id HASH);
+create index info_type_id_person_info on person_info(info_type_id HASH);
+create index keyword_id_movie_keyword on movie_keyword(keyword_id HASH);
+create index kind_id_aka_title on aka_title(kind_id HASH);
+create index kind_id_title on title(kind_id HASH);
+create index linked_movie_id_movie_link on movie_link(linked_movie_id HASH);
+create index link_type_id_movie_link on movie_link(link_type_id HASH);
+create index movie_id_aka_title on aka_title(movie_id HASH);
+create index movie_id_cast_info on cast_info(movie_id HASH);
+create index movie_id_complete_cast on complete_cast(movie_id HASH);
+create index movie_id_movie_companies on movie_companies(movie_id HASH);
+create index movie_id_movie_info_idx on movie_info_idx(movie_id HASH);
+create index movie_id_movie_keyword on movie_keyword(movie_id HASH);
+create index movie_id_movie_link on movie_link(movie_id HASH);
+create index movie_id_movie_info on movie_info(movie_id HASH);
+create index person_id_aka_name on aka_name(person_id HASH);
+create index person_id_cast_info on cast_info(person_id HASH);
+create index person_id_person_info on person_info(person_id HASH);
+create index person_role_id_cast_info on cast_info(person_role_id HASH);
+create index role_id_cast_info on cast_info(role_id HASH);
diff --git a/sql/join-order-benchmark/import.py b/sql/join-order-benchmark/import.py
new file mode 100644
index 00000000..5c0d52dd
--- /dev/null
+++ b/sql/join-order-benchmark/import.py
@@ -0,0 +1,91 @@
+import argparse
+import re
+from tqdm import tqdm
+
+import psycopg2
+
+
+def import_from_local(cur, cleaned):
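+    # Parse a single "COPY <table> FROM '<path>' WITH (<options>)" statement and
+    # stream the referenced local CSV file into the table via cursor.copy_from().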
+ copy_re = r"(?i)\bCOPY\b\s(.+)\s\bFROM\b\s\'(.*)\'\s\bWITH\b\s\((.*\,?)\)"
+ parse_re = re.findall(copy_re, cleaned, re.MULTILINE)[0]
+ table_name = parse_re[0]
+ local_path = parse_re[1]
+ params = parse_re[2]
+
+ delimiter = ","
+ file_format = None
+ null_format = ''
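+    # Pull the delimiter, file format and NULL marker out of the WITH (...) options.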
+ if 'delimiter' in params.lower():
+ delimiter = re.findall(r"(?i)delimiter\s\'(.{1,3})\'", params)[0]
+ if delimiter == "\\t":
+ delimiter = "\t"
+ if 'format' in params.lower():
+ file_format = re.findall(r"(?i)format\s([a-zA-Z]+)", params)[0]
+ if 'null' in params.lower():
+ null_format = re.findall(r"(?i)null\s\'([a-zA-Z]+)\'", params)[0]
+
+    if not file_format or 'csv' not in file_format.lower():
+        raise AttributeError("Can't import from non-CSV files")
+
+ with open(local_path, "r") as csv_file:
+ cur.copy_from(csv_file, table_name,
+ sep=delimiter,
+ null=null_format)
+
+
+def apply_variables(data_path, queries_str):
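+    # Substitute known placeholders (currently only $DATA_PATH) in the import
+    # script with the values supplied on the command line.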
+ variables = {
+ "$DATA_PATH": data_path
+ }
+
+ for variable_name, variable_value in variables.items():
+ if variable_value:
+ queries_str = queries_str.replace(variable_name,
+ str(variable_value))
+
+ return queries_str
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Benchmark data import tool for PostgreSQL compatible databases')
+
+ parser.add_argument('--host',
+ default="127.0.0.1",
+ help='Target host IP for postgres compatible database')
+ parser.add_argument('--port',
+ default=5433,
+ help='Target port for postgres compatible database')
+ parser.add_argument('--username',
+ default="yugabyte",
+ help='Username for connection')
+ parser.add_argument('--password',
+ default="yugabyte",
+ help='Password for user for connection')
+ parser.add_argument('--database',
+ default="taqo",
+ help='Target database in postgres compatible database')
+
+ parser.add_argument('--import_file',
+ default="import.sql",
+ help='Import file path')
+ parser.add_argument('--data_path',
+ help='Data folder path')
+
+ args = parser.parse_args()
+
+ with psycopg2.connect(
+ host=args.host,
+ port=args.port,
+ database=args.database,
+ user=args.username,
+ password=args.password) as conn:
+ with conn.cursor() as cur:
+            with open(args.import_file, "r") as sql_file:
+                full_queries = apply_variables(args.data_path, sql_file.read())
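+            # The import file holds one COPY statement per table, separated by ';'.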
+            for query in tqdm(full_queries.split(";")):
+                if cleaned := query.strip():
+                    import_from_local(cur, cleaned)
diff --git a/sql/join-order-benchmark/model.conf b/sql/join-order-benchmark/model.conf
new file mode 100644
index 00000000..eefa435f
--- /dev/null
+++ b/sql/join-order-benchmark/model.conf
@@ -0,0 +1,3 @@
+all-index-check = false
+load-catalog-tables = false
+compaction-timeout = 2400
\ No newline at end of file
diff --git a/sql/join-order-benchmark/obsolete.create.sql b/sql/join-order-benchmark/obsolete.create.sql
deleted file mode 100644
index 18041954..00000000
--- a/sql/join-order-benchmark/obsolete.create.sql
+++ /dev/null
@@ -1,194 +0,0 @@
-CREATE TABLE aka_name (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- name text NOT NULL,
- imdb_index character varying(12),
- name_pcode_cf character varying(5),
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
-) WITH (colocated = true);
-
-CREATE TABLE aka_title (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- title text NOT NULL,
- imdb_index character varying(12),
- kind_id integer NOT NULL,
- production_year integer,
- phonetic_code character varying(5),
- episode_of_id integer,
- season_nr integer,
- episode_nr integer,
- note text,
- md5sum character varying(32)
-) WITH (colocated = true);
-
-CREATE TABLE cast_info (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- movie_id integer NOT NULL,
- person_role_id integer,
- note text,
- nr_order integer,
- role_id integer NOT NULL
-) WITH (colocated = true);
-
-CREATE TABLE char_name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- imdb_index character varying(12),
- imdb_id integer,
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
-) WITH (colocated = true);
-
-CREATE TABLE comp_cast_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(32) NOT NULL
-) WITH (colocated = true);
-
-CREATE TABLE company_name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- country_code character varying(255),
- imdb_id integer,
- name_pcode_nf character varying(5),
- name_pcode_sf character varying(5),
- md5sum character varying(32)
-) WITH (colocated = true);
-
-CREATE TABLE company_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(32) NOT NULL
-);
-
-CREATE TABLE complete_cast (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer,
- subject_id integer NOT NULL,
- status_id integer NOT NULL
-) WITH (colocated = true);
-
-CREATE TABLE info_type (
- id integer NOT NULL PRIMARY KEY,
- info character varying(32) NOT NULL
-) WITH (colocated = true);
-
-CREATE TABLE keyword (
- id integer NOT NULL PRIMARY KEY,
- keyword text NOT NULL,
- phonetic_code character varying(5)
-) WITH (colocated = true);
-
-CREATE TABLE kind_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(15) NOT NULL
-) WITH (colocated = true);
-
-CREATE TABLE link_type (
- id integer NOT NULL PRIMARY KEY,
- link character varying(32) NOT NULL
-);
-
-CREATE TABLE movie_companies (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- company_id integer NOT NULL,
- company_type_id integer NOT NULL,
- note text
-) WITH (colocated = true);
-
-CREATE TABLE movie_info (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
-) WITH (colocated = true);
-
-CREATE TABLE movie_info_idx (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
-) WITH (colocated = true);
-
-CREATE TABLE movie_keyword (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- keyword_id integer NOT NULL
-) WITH (colocated = true);
-
-CREATE TABLE movie_link (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- linked_movie_id integer NOT NULL,
- link_type_id integer NOT NULL
-) WITH (colocated = true);
-
-CREATE TABLE name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- imdb_index character varying(12),
- imdb_id integer,
- gender character varying(1),
- name_pcode_cf character varying(5),
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
-) WITH (colocated = true);
-
-CREATE TABLE person_info (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
-) WITH (colocated = true);
-
-CREATE TABLE role_type (
- id integer NOT NULL PRIMARY KEY,
- role character varying(32) NOT NULL
-);
-
-CREATE TABLE title (
- id integer NOT NULL PRIMARY KEY,
- title text NOT NULL,
- imdb_index character varying(12),
- kind_id integer NOT NULL,
- production_year integer,
- imdb_id integer,
- phonetic_code character varying(5),
- episode_of_id integer,
- season_nr integer,
- episode_nr integer,
- series_years character varying(49),
- md5sum character varying(32)
-) WITH (colocated = true);
-
-create index company_id_movie_companies on movie_companies(company_id);
-create index company_type_id_movie_companies on movie_companies(company_type_id);
-create index info_type_id_movie_info_idx on movie_info_idx(info_type_id);
-create index info_type_id_movie_info on movie_info(info_type_id);
-create index info_type_id_person_info on person_info(info_type_id);
-create index keyword_id_movie_keyword on movie_keyword(keyword_id);
-create index kind_id_aka_title on aka_title(kind_id);
-create index kind_id_title on title(kind_id);
-create index linked_movie_id_movie_link on movie_link(linked_movie_id);
-create index link_type_id_movie_link on movie_link(link_type_id);
-create index movie_id_aka_title on aka_title(movie_id);
-create index movie_id_cast_info on cast_info(movie_id);
-create index movie_id_complete_cast on complete_cast(movie_id);
-create index movie_id_movie_companies on movie_companies(movie_id);
-create index movie_id_movie_info_idx on movie_info_idx(movie_id);
-create index movie_id_movie_keyword on movie_keyword(movie_id);
-create index movie_id_movie_link on movie_link(movie_id);
-create index movie_id_movie_info on movie_info(movie_id);
-create index person_id_aka_name on aka_name(person_id);
-create index person_id_cast_info on cast_info(person_id);
-create index person_id_person_info on person_info(person_id);
-create index person_role_id_cast_info on cast_info(person_role_id);
-create index role_id_cast_info on cast_info(role_id);
diff --git a/sql/join-order-benchmark/postgres.create.sql b/sql/join-order-benchmark/postgres.create.sql
index 5224863a..b9f5876e 100644
--- a/sql/join-order-benchmark/postgres.create.sql
+++ b/sql/join-order-benchmark/postgres.create.sql
@@ -1,172 +1,172 @@
CREATE TABLE aka_name (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- name text NOT NULL,
- imdb_index character varying(12),
- name_pcode_cf character varying(5),
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
+ id integer NOT NULL PRIMARY KEY,
+ person_id integer NOT NULL,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ name_pcode_cf character varying(5),
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32)
);
CREATE TABLE aka_title (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- title text NOT NULL,
- imdb_index character varying(12),
- kind_id integer NOT NULL,
- production_year integer,
- phonetic_code character varying(5),
- episode_of_id integer,
- season_nr integer,
- episode_nr integer,
- note text,
- md5sum character varying(32)
+ id integer NOT NULL PRIMARY KEY,
+ movie_id integer NOT NULL,
+ title text NOT NULL,
+ imdb_index character varying(12),
+ kind_id integer NOT NULL,
+ production_year integer,
+ phonetic_code character varying(5),
+ episode_of_id integer,
+ season_nr integer,
+ episode_nr integer,
+ note text,
+ md5sum character varying(32)
);
CREATE TABLE cast_info (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- movie_id integer NOT NULL,
- person_role_id integer,
- note text,
- nr_order integer,
- role_id integer NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ person_id integer NOT NULL,
+ movie_id integer NOT NULL,
+ person_role_id integer,
+ note text,
+ nr_order integer,
+ role_id integer NOT NULL
);
CREATE TABLE char_name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- imdb_index character varying(12),
- imdb_id integer,
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
+ id integer NOT NULL PRIMARY KEY,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ imdb_id integer,
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32)
);
CREATE TABLE comp_cast_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(32) NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ kind character varying(32) NOT NULL
);
CREATE TABLE company_name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- country_code character varying(255),
- imdb_id integer,
- name_pcode_nf character varying(5),
- name_pcode_sf character varying(5),
- md5sum character varying(32)
+ id integer NOT NULL PRIMARY KEY,
+ name text NOT NULL,
+ country_code character varying(255),
+ imdb_id integer,
+ name_pcode_nf character varying(5),
+ name_pcode_sf character varying(5),
+ md5sum character varying(32)
);
CREATE TABLE company_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(32) NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ kind character varying(32) NOT NULL
);
CREATE TABLE complete_cast (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer,
- subject_id integer NOT NULL,
- status_id integer NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ movie_id integer,
+ subject_id integer NOT NULL,
+ status_id integer NOT NULL
);
CREATE TABLE info_type (
- id integer NOT NULL PRIMARY KEY,
- info character varying(32) NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ info character varying(32) NOT NULL
);
CREATE TABLE keyword (
- id integer NOT NULL PRIMARY KEY,
- keyword text NOT NULL,
- phonetic_code character varying(5)
+ id integer NOT NULL PRIMARY KEY,
+ keyword text NOT NULL,
+ phonetic_code character varying(5)
);
CREATE TABLE kind_type (
- id integer NOT NULL PRIMARY KEY,
- kind character varying(15) NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ kind character varying(15) NOT NULL
);
CREATE TABLE link_type (
- id integer NOT NULL PRIMARY KEY,
- link character varying(32) NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ link character varying(32) NOT NULL
);
CREATE TABLE movie_companies (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- company_id integer NOT NULL,
- company_type_id integer NOT NULL,
- note text
+ id integer NOT NULL PRIMARY KEY,
+ movie_id integer NOT NULL,
+ company_id integer NOT NULL,
+ company_type_id integer NOT NULL,
+ note text
);
CREATE TABLE movie_info (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
+ id integer NOT NULL PRIMARY KEY,
+ movie_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text
);
CREATE TABLE movie_info_idx (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
+ id integer NOT NULL PRIMARY KEY,
+ movie_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text
);
CREATE TABLE movie_keyword (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- keyword_id integer NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ movie_id integer NOT NULL,
+ keyword_id integer NOT NULL
);
CREATE TABLE movie_link (
- id integer NOT NULL PRIMARY KEY,
- movie_id integer NOT NULL,
- linked_movie_id integer NOT NULL,
- link_type_id integer NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ movie_id integer NOT NULL,
+ linked_movie_id integer NOT NULL,
+ link_type_id integer NOT NULL
);
CREATE TABLE name (
- id integer NOT NULL PRIMARY KEY,
- name text NOT NULL,
- imdb_index character varying(12),
- imdb_id integer,
- gender character varying(1),
- name_pcode_cf character varying(5),
- name_pcode_nf character varying(5),
- surname_pcode character varying(5),
- md5sum character varying(32)
+ id integer NOT NULL PRIMARY KEY,
+ name text NOT NULL,
+ imdb_index character varying(12),
+ imdb_id integer,
+ gender character varying(1),
+ name_pcode_cf character varying(5),
+ name_pcode_nf character varying(5),
+ surname_pcode character varying(5),
+ md5sum character varying(32)
);
CREATE TABLE person_info (
- id integer NOT NULL PRIMARY KEY,
- person_id integer NOT NULL,
- info_type_id integer NOT NULL,
- info text NOT NULL,
- note text
+ id integer NOT NULL PRIMARY KEY,
+ person_id integer NOT NULL,
+ info_type_id integer NOT NULL,
+ info text NOT NULL,
+ note text
);
CREATE TABLE role_type (
- id integer NOT NULL PRIMARY KEY,
- role character varying(32) NOT NULL
+ id integer NOT NULL PRIMARY KEY,
+ role character varying(32) NOT NULL
);
CREATE TABLE title (
- id integer NOT NULL PRIMARY KEY,
- title text NOT NULL,
- imdb_index character varying(12),
- kind_id integer NOT NULL,
- production_year integer,
- imdb_id integer,
- phonetic_code character varying(5),
- episode_of_id integer,
- season_nr integer,
- episode_nr integer,
- series_years character varying(49),
- md5sum character varying(32)
+ id integer NOT NULL PRIMARY KEY,
+ title text NOT NULL,
+ imdb_index character varying(12),
+ kind_id integer NOT NULL,
+ production_year integer,
+ imdb_id integer,
+ phonetic_code character varying(5),
+ episode_of_id integer,
+ season_nr integer,
+ episode_nr integer,
+ series_years character varying(49),
+ md5sum character varying(32)
);
create index company_id_movie_companies on movie_companies(company_id);
diff --git a/sql/join-order-benchmark/stats.sql b/sql/join-order-benchmark/stats.sql
deleted file mode 100644
index d806e01c..00000000
--- a/sql/join-order-benchmark/stats.sql
+++ /dev/null
@@ -1,21 +0,0 @@
-select (select count(*) from aka_name),
- (select count(*) from aka_title),
- (select count(*) from cast_info),
- (select count(*) from char_name),
- (select count(*) from comp_cast_type),
- (select count(*) from company_name),
- (select count(*) from company_type),
- (select count(*) from complete_cast),
- (select count(*) from info_type),
- (select count(*) from keyword),
- (select count(*) from kind_type),
- (select count(*) from link_type),
- (select count(*) from movie_companies),
- (select count(*) from movie_info),
- (select count(*) from movie_info_idx),
- (select count(*) from movie_keyword),
- (select count(*) from movie_link),
- (select count(*) from name),
- (select count(*) from person_info),
- (select count(*) from role_type),
- (select count(*) from title);
\ No newline at end of file
diff --git a/sql/subqueries/create.sql b/sql/subqueries/create.sql
index 286f32dc..0583f3ce 100644
--- a/sql/subqueries/create.sql
+++ b/sql/subqueries/create.sql
@@ -1,23 +1,23 @@
CREATE TABLE t1000000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
-CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
-CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
-CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
-CREATE INDEX t1000000_4_idx ON t1000000 (c_int, c_varchar);
-CREATE INDEX t1000000_5_idx ON t1000000 (c_float, c_text, c_varchar);
-CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
+CREATE INDEX t1000000_1_idx ON t1000000 (c_int ASC);
+CREATE INDEX t1000000_2_idx ON t1000000 (c_int ASC, c_bool ASC);
+CREATE INDEX t1000000_3_idx ON t1000000 (c_int ASC, c_text ASC);
+CREATE INDEX t1000000_4_idx ON t1000000 (c_int ASC, c_varchar ASC);
+CREATE INDEX t1000000_5_idx ON t1000000 (c_float ASC, c_text ASC, c_varchar ASC);
+CREATE INDEX t1000000_6_idx ON t1000000 (c_float ASC, c_decimal ASC, c_varchar ASC);
+CREATE INDEX t1000000_7_idx ON t1000000 (c_float ASC, c_real ASC, c_money ASC);
CREATE TABLE t500000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
@@ -26,46 +26,46 @@ SELECT c_int,
(c_int + 0.2):: float as c_float,
(c_int + 0.3):: real as c_real,
(c_int + 0.4) ::money as c_money FROM generate_Series(1, 50000 * $MULTIPLIER) c_int;
-CREATE INDEX t500000_1_idx ON t500000 (c_int);
-CREATE INDEX t500000_2_idx ON t500000 (c_int, c_bool);
-CREATE INDEX t500000_3_idx ON t500000 (c_int, c_text);
-CREATE INDEX t500000_4_idx ON t500000 (c_int, c_varchar);
-CREATE INDEX t500000_5_idx ON t500000 (c_float, c_text, c_varchar);
-CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
+CREATE INDEX t500000_1_idx ON t500000 (c_int ASC);
+CREATE INDEX t500000_2_idx ON t500000 (c_int ASC, c_bool ASC);
+CREATE INDEX t500000_3_idx ON t500000 (c_int ASC, c_text ASC);
+CREATE INDEX t500000_4_idx ON t500000 (c_int ASC, c_varchar ASC);
+CREATE INDEX t500000_5_idx ON t500000 (c_float ASC, c_text ASC, c_varchar ASC);
+CREATE INDEX t500000_6_idx ON t500000 (c_float ASC, c_decimal ASC, c_varchar ASC);
+CREATE INDEX t500000_7_idx ON t500000 (c_float ASC, c_real ASC, c_money ASC);
CREATE TABLE t50000
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
-CREATE INDEX t50000_1_idx ON t50000 (c_int);
-CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
-CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
-CREATE INDEX t50000_4_idx ON t50000 (c_int, c_varchar);
-CREATE INDEX t50000_5_idx ON t50000 (c_float, c_text, c_varchar);
-CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
+CREATE INDEX t50000_1_idx ON t50000 (c_int ASC);
+CREATE INDEX t50000_2_idx ON t50000 (c_int ASC, c_bool ASC);
+CREATE INDEX t50000_3_idx ON t50000 (c_int ASC, c_text ASC);
+CREATE INDEX t50000_4_idx ON t50000 (c_int ASC, c_varchar ASC);
+CREATE INDEX t50000_5_idx ON t50000 (c_float ASC, c_text ASC, c_varchar ASC);
+CREATE INDEX t50000_6_idx ON t50000 (c_float ASC, c_decimal ASC, c_varchar ASC);
+CREATE INDEX t50000_7_idx ON t50000 (c_float ASC, c_real ASC, c_money ASC);
CREATE TABLE t100
- WITH (colocation = true) AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
-CREATE INDEX t100_1_idx ON t100 (c_int);
-CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
-CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
-CREATE INDEX t100_4_idx ON t100 (c_int, c_varchar);
-CREATE INDEX t100_5_idx ON t100 (c_float, c_text, c_varchar);
-CREATE INDEX t100_6_idx ON t100 (c_float, c_decimal, c_varchar);
-CREATE INDEX t100_7_idx ON t100 (c_float, c_real, c_money);
\ No newline at end of file
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
+CREATE INDEX t100_1_idx ON t100 (c_int ASC);
+CREATE INDEX t100_2_idx ON t100 (c_int ASC, c_bool ASC);
+CREATE INDEX t100_3_idx ON t100 (c_int ASC, c_text ASC);
+CREATE INDEX t100_4_idx ON t100 (c_int ASC, c_varchar ASC);
+CREATE INDEX t100_5_idx ON t100 (c_float ASC, c_text ASC, c_varchar ASC);
+CREATE INDEX t100_6_idx ON t100 (c_float ASC, c_decimal ASC, c_varchar ASC);
+CREATE INDEX t100_7_idx ON t100 (c_float ASC, c_real ASC, c_money ASC);
\ No newline at end of file
diff --git a/sql/subqueries/hash.create.sql b/sql/subqueries/hash.create.sql
new file mode 100644
index 00000000..71cf5684
--- /dev/null
+++ b/sql/subqueries/hash.create.sql
@@ -0,0 +1,71 @@
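+-- Subqueries model tables with every secondary index declared HASH
+-- (HASH counterpart of the ASC-ordered create script).
+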
+CREATE TABLE t1000000
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
+CREATE INDEX t1000000_1_idx ON t1000000 (c_int HASH);
+CREATE INDEX t1000000_2_idx ON t1000000 ((c_int, c_bool) HASH);
+CREATE INDEX t1000000_3_idx ON t1000000 ((c_int, c_text) HASH);
+CREATE INDEX t1000000_4_idx ON t1000000 ((c_int, c_varchar) HASH);
+CREATE INDEX t1000000_5_idx ON t1000000 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t1000000_6_idx ON t1000000 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t1000000_7_idx ON t1000000 ((c_float, c_real, c_money) HASH);
+
+CREATE TABLE t500000
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 50000 * $MULTIPLIER) c_int;
+CREATE INDEX t500000_1_idx ON t500000 (c_int HASH);
+CREATE INDEX t500000_2_idx ON t500000 ((c_int, c_bool) HASH);
+CREATE INDEX t500000_3_idx ON t500000 ((c_int, c_text) HASH);
+CREATE INDEX t500000_4_idx ON t500000 ((c_int, c_varchar) HASH);
+CREATE INDEX t500000_5_idx ON t500000 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t500000_6_idx ON t500000 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t500000_7_idx ON t500000 ((c_float, c_real, c_money) HASH);
+
+CREATE TABLE t50000
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
+CREATE INDEX t50000_1_idx ON t50000 (c_int HASH);
+CREATE INDEX t50000_2_idx ON t50000 ((c_int, c_bool) HASH);
+CREATE INDEX t50000_3_idx ON t50000 ((c_int, c_text) HASH);
+CREATE INDEX t50000_4_idx ON t50000 ((c_int, c_varchar) HASH);
+CREATE INDEX t50000_5_idx ON t50000 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t50000_6_idx ON t50000 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t50000_7_idx ON t50000 ((c_float, c_real, c_money) HASH);
+
+CREATE TABLE t100
+AS
+SELECT c_int,
+ (case when c_int % 2 = 0 then true else false end) as c_bool,
+ (c_int + 0.0001)::text as c_text,
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
+CREATE INDEX t100_1_idx ON t100 (c_int HASH);
+CREATE INDEX t100_2_idx ON t100 ((c_int, c_bool) HASH);
+CREATE INDEX t100_3_idx ON t100 ((c_int, c_text) HASH);
+CREATE INDEX t100_4_idx ON t100 ((c_int, c_varchar) HASH);
+CREATE INDEX t100_5_idx ON t100 ((c_float, c_text, c_varchar) HASH);
+CREATE INDEX t100_6_idx ON t100 ((c_float, c_decimal, c_varchar) HASH);
+CREATE INDEX t100_7_idx ON t100 ((c_float, c_real, c_money) HASH);
\ No newline at end of file
diff --git a/sql/subqueries/model.conf b/sql/subqueries/model.conf
new file mode 100644
index 00000000..2eb29eb8
--- /dev/null
+++ b/sql/subqueries/model.conf
@@ -0,0 +1,2 @@
+all-index-check = false
+load-catalog-tables = false
\ No newline at end of file
diff --git a/sql/subqueries/obsolete.create.sql b/sql/subqueries/obsolete.create.sql
deleted file mode 100644
index 5bea7f90..00000000
--- a/sql/subqueries/obsolete.create.sql
+++ /dev/null
@@ -1,71 +0,0 @@
-CREATE TABLE t1000000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
-CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
-CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
-CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
-CREATE INDEX t1000000_4_idx ON t1000000 (c_int, c_varchar);
-CREATE INDEX t1000000_5_idx ON t1000000 (c_float, c_text, c_varchar);
-CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
-
-CREATE TABLE t500000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 50000 * $MULTIPLIER) c_int;
-CREATE INDEX t500000_1_idx ON t500000 (c_int);
-CREATE INDEX t500000_2_idx ON t500000 (c_int, c_bool);
-CREATE INDEX t500000_3_idx ON t500000 (c_int, c_text);
-CREATE INDEX t500000_4_idx ON t500000 (c_int, c_varchar);
-CREATE INDEX t500000_5_idx ON t500000 (c_float, c_text, c_varchar);
-CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
-
-CREATE TABLE t50000
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
-CREATE INDEX t50000_1_idx ON t50000 (c_int);
-CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
-CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
-CREATE INDEX t50000_4_idx ON t50000 (c_int, c_varchar);
-CREATE INDEX t50000_5_idx ON t50000 (c_float, c_text, c_varchar);
-CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
-CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
-
-CREATE TABLE t100
- WITH (colocated = true) AS
-SELECT c_int,
- (case when c_int % 2 = 0 then true else false end) as c_bool,
- (c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
-CREATE INDEX t100_1_idx ON t100 (c_int);
-CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
-CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
-CREATE INDEX t100_4_idx ON t100 (c_int, c_varchar);
-CREATE INDEX t100_5_idx ON t100 (c_float, c_text, c_varchar);
-CREATE INDEX t100_6_idx ON t100 (c_float, c_decimal, c_varchar);
-CREATE INDEX t100_7_idx ON t100 (c_float, c_real, c_money);
\ No newline at end of file
diff --git a/sql/subqueries/postgres.create.sql b/sql/subqueries/postgres.create.sql
index ef0a3b7e..eb4d4f15 100644
--- a/sql/subqueries/postgres.create.sql
+++ b/sql/subqueries/postgres.create.sql
@@ -1,13 +1,13 @@
CREATE TABLE t1000000
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series(1, 100000 * $MULTIPLIER) c_int;
CREATE INDEX t1000000_1_idx ON t1000000 (c_int);
CREATE INDEX t1000000_2_idx ON t1000000 (c_int, c_bool);
CREATE INDEX t1000000_3_idx ON t1000000 (c_int, c_text);
@@ -17,7 +17,7 @@ CREATE INDEX t1000000_6_idx ON t1000000 (c_float, c_decimal, c_varchar);
CREATE INDEX t1000000_7_idx ON t1000000 (c_float, c_real, c_money);
CREATE TABLE t500000
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
@@ -35,15 +35,15 @@ CREATE INDEX t500000_6_idx ON t500000 (c_float, c_decimal, c_varchar);
CREATE INDEX t500000_7_idx ON t500000 (c_float, c_real, c_money);
CREATE TABLE t50000
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 5000 * $MULTIPLIER) c_int;
CREATE INDEX t50000_1_idx ON t50000 (c_int);
CREATE INDEX t50000_2_idx ON t50000 (c_int, c_bool);
CREATE INDEX t50000_3_idx ON t50000 (c_int, c_text);
@@ -53,15 +53,15 @@ CREATE INDEX t50000_6_idx ON t50000 (c_float, c_decimal, c_varchar);
CREATE INDEX t50000_7_idx ON t50000 (c_float, c_real, c_money);
CREATE TABLE t100
- AS
+AS
SELECT c_int,
(case when c_int % 2 = 0 then true else false end) as c_bool,
(c_int + 0.0001)::text as c_text,
- (c_int + 0.0002):: varchar as c_varchar,
- (c_int + 0.1):: decimal as c_decimal,
- (c_int + 0.2):: float as c_float,
- (c_int + 0.3):: real as c_real,
- (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
+ (c_int + 0.0002):: varchar as c_varchar,
+ (c_int + 0.1):: decimal as c_decimal,
+ (c_int + 0.2):: float as c_float,
+ (c_int + 0.3):: real as c_real,
+ (c_int + 0.4) ::money as c_money FROM generate_Series (1, 10 * $MULTIPLIER) c_int;
CREATE INDEX t100_1_idx ON t100 (c_int);
CREATE INDEX t100_2_idx ON t100 (c_int, c_bool);
CREATE INDEX t100_3_idx ON t100 (c_int, c_text);
diff --git a/sql/subqueries/queries/subq_field.sql b/sql/subqueries/queries/subq_field.sql
index 99da98f3..8047b0fd 100644
--- a/sql/subqueries/queries/subq_field.sql
+++ b/sql/subqueries/queries/subq_field.sql
@@ -1,12 +1,14 @@
-- FULL JOINS
-- SUBSELECT more that some value from subquery
+-- tags: skip_consistency_check
+
SELECT t1000000.c_text,
(SELECT t500000.c_money
FROM t500000
FULL JOIN t50000 ON t500000.c_int = t50000.c_int
WHERE t500000.c_float < %(5000) LIMIT 1)
FROM t1000000
-WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000)
+WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000 as t50k)
ORDER BY t1000000.c_decimal
LIMIT 1000;
@@ -17,7 +19,7 @@ SELECT t1000000.c_text,
FULL JOIN t50000 ON t500000.c_int = t50000.c_int
WHERE t500000.c_float > %(5000) LIMIT 1)
FROM t1000000
-WHERE t1000000.c_real > (SELECT AVG(c_real) from t50000)
+WHERE t1000000.c_real > (SELECT AVG(c_real) from t50000 as t50k)
ORDER BY t1000000.c_decimal;
-- SUBSELECT in range of values
@@ -28,20 +30,7 @@ SELECT t1000000.c_text,
WHERE t500000.c_float < %(5000) LIMIT 1)
FROM t1000000
WHERE t1000000.c_real in
- (SELECT t50000.c_real from t50000 where t50000.c_real < %(100))
-ORDER BY t1000000.c_decimal
-LIMIT 1000;
-
-
--- INNER JOINS
--- SUBSELECT more that some value from subquery
-SELECT t1000000.c_text,
- (SELECT t500000.c_money
- FROM t500000
- INNER JOIN t50000 ON t500000.c_int = t50000.c_int
- WHERE t500000.c_float < %(5000) LIMIT 1)
-FROM t1000000
-WHERE t1000000.c_real < (SELECT MAX(c_real) from t50000)
+ (SELECT t50k.c_real from t50000 as t50k where t50k.c_real < %(100))
ORDER BY t1000000.c_decimal
LIMIT 1000;
@@ -52,21 +41,9 @@ SELECT t1000000.c_text,
INNER JOIN t50000 ON t500000.c_int = t50000.c_int
WHERE t500000.c_float > %(5000) LIMIT 1)
FROM t1000000
-WHERE t1000000.c_real > (SELECT AVG(c_real) from t50000)
+WHERE t1000000.c_real > (SELECT AVG(c_real) from t50000 as t50k)
ORDER BY t1000000.c_decimal;
--- SUBSELECT in range of values
-SELECT t1000000.c_text,
- (SELECT t500000.c_money
- FROM t500000
- INNER JOIN t50000 ON t500000.c_int = t50000.c_int
- WHERE t500000.c_float < %(5000) LIMIT 1)
-FROM t1000000
-WHERE t1000000.c_real in
- (SELECT t50000.c_real from t50000 where t50000.c_real < %(100))
-ORDER BY t1000000.c_decimal
-LIMIT 1000;
-
-- LEFT JOINS
-- SUBSELECT more that some value from subquery
SELECT t1000000.c_text,
@@ -75,20 +52,10 @@ SELECT t1000000.c_text,
INNER JOIN t50000 ON t500000.c_int = t50000.c_int
WHERE t500000.c_float < %(5000) LIMIT 1)
FROM t1000000
-WHERE t1000000.c_real < (SELECT MAX(c_real) from t50000)
+WHERE t1000000.c_real < (SELECT MAX(c_real) from t50000 as t50k)
ORDER BY t1000000.c_decimal
LIMIT 1000;
--- SUBSELECT less that some value from subquery
-SELECT t1000000.c_text,
- (SELECT t500000.c_money
- FROM t500000
- INNER JOIN t50000 ON t500000.c_int = t50000.c_int
- WHERE t500000.c_float > %(5000) LIMIT 1)
-FROM t1000000
-WHERE t1000000.c_real > (SELECT AVG(c_real) from t50000)
-ORDER BY t1000000.c_decimal;
-
-- SUBSELECT in range of values
SELECT t1000000.c_text,
(SELECT t500000.c_money
@@ -97,6 +64,6 @@ SELECT t1000000.c_text,
WHERE t500000.c_float < %(5000) LIMIT 1)
FROM t1000000
WHERE t1000000.c_real in
- (SELECT t50000.c_real from t50000 where t50000.c_real < %(100))
+ (SELECT t50k.c_real from t50000 as t50k where t50k.c_real < %(100))
ORDER BY t1000000.c_decimal
LIMIT 1000;
\ No newline at end of file
diff --git a/sql/subqueries/queries/subq_where.sql b/sql/subqueries/queries/subq_where.sql
index 5839f82e..f6eebce3 100644
--- a/sql/subqueries/queries/subq_where.sql
+++ b/sql/subqueries/queries/subq_where.sql
@@ -4,7 +4,7 @@ SELECT *
FROM t1000000
FULL JOIN t500000 ON t1000000.c_int = t500000.c_int
FULL JOIN t50000 ON t1000000.c_int = t50000.c_int
-WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000)
+WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000 as t50k)
ORDER BY t1000000.c_int
LIMIT 1000;
@@ -13,7 +13,7 @@ SELECT *
FROM t1000000
FULL JOIN t500000 ON t1000000.c_int = t500000.c_int
FULL JOIN t50000 ON t1000000.c_int = t50000.c_int
-WHERE t1000000.c_int > (SELECT AVG(c_real) from t50000)
+WHERE t1000000.c_int > (SELECT AVG(c_real) from t50000 as t50k)
ORDER BY t1000000.c_int;
-- WHERE SUBSELECT in range of values
@@ -22,7 +22,7 @@ FROM t1000000
FULL JOIN t500000 ON t1000000.c_int = t500000.c_int
FULL join t50000 ON t1000000.c_int = t50000.c_int
WHERE t1000000.c_int in
- (SELECT t50000.c_int from t50000 where t50000.c_int < %(100))
+ (SELECT t50k.c_int from t50000 as t50k where t50k.c_int < %(100))
ORDER BY t1000000.c_int
LIMIT 1000;
@@ -32,7 +32,7 @@ SELECT *
FROM t1000000
INNER JOIN t500000 ON t1000000.c_int = t500000.c_int
INNER JOIN t50000 ON t1000000.c_int = t50000.c_int
-WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000)
+WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000 as t50k)
ORDER BY t1000000.c_int
LIMIT 1000;
@@ -41,7 +41,7 @@ SELECT *
FROM t1000000
INNER JOIN t500000 ON t1000000.c_int = t500000.c_int
INNER JOIN t50000 ON t1000000.c_int = t50000.c_int
-WHERE t1000000.c_int > (SELECT AVG(c_real) from t50000)
+WHERE t1000000.c_int > (SELECT AVG(c_real) from t50000 as t50k)
ORDER BY t1000000.c_float;
-- WHERE SUBSELECT in range of values
@@ -50,7 +50,7 @@ FROM t1000000
INNER JOIN t500000 ON t1000000.c_int = t500000.c_int
INNER join t50000 ON t1000000.c_int = t50000.c_int
WHERE t1000000.c_int in
- (SELECT t50000.c_int from t50000 where t50000.c_real < %(100))
+ (SELECT t50k.c_int from t50000 as t50k where t50k.c_real < %(100))
ORDER BY t1000000.c_float
LIMIT 1000;
@@ -60,7 +60,7 @@ SELECT *
FROM t1000000
LEFT OUTER JOIN t500000 ON t1000000.c_int = t500000.c_int
LEFT OUTER JOIN t50000 ON t1000000.c_int = t50000.c_int
-WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000)
+WHERE t1000000.c_int < (SELECT MAX(c_real) from t50000 as t50k)
ORDER BY t1000000.c_float
LIMIT 1000;
@@ -69,7 +69,7 @@ SELECT *
FROM t1000000
LEFT OUTER JOIN t500000 ON t1000000.c_int = t500000.c_int
LEFT OUTER JOIN t50000 ON t1000000.c_int = t50000.c_int
-WHERE t1000000.c_int > (SELECT AVG(c_real) from t50000)
+WHERE t1000000.c_int > (SELECT AVG(c_real) from t50000 as t50k)
ORDER BY t1000000.c_int;
-- WHERE SUBSELECT in range of values
@@ -78,6 +78,6 @@ FROM t1000000
LEFT OUTER JOIN t500000 ON t1000000.c_int = t500000.c_int
LEFT OUTER JOIN t50000 ON t1000000.c_int = t50000.c_int
WHERE t1000000.c_int in
- (SELECT t50000.c_int from t50000 where t50000.c_real < %(100))
+ (SELECT t50k.c_int from t50000 as t50k where t50k.c_real < %(100))
ORDER BY t1000000.c_float
LIMIT 1000;
\ No newline at end of file
diff --git a/sql/subqueries/stats.sql b/sql/subqueries/stats.sql
deleted file mode 100644
index 53a0dc0f..00000000
--- a/sql/subqueries/stats.sql
+++ /dev/null
@@ -1,4 +0,0 @@
-select (select count(*) from t1000000),
- (select count(*) from t500000),
- (select count(*) from t50000),
- (select count(*) from t100);
\ No newline at end of file
diff --git a/sql/tpch/analyze.sql b/sql/tpch/analyze.sql
new file mode 100644
index 00000000..91f496c2
--- /dev/null
+++ b/sql/tpch/analyze.sql
@@ -0,0 +1,8 @@
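+-- Refresh planner statistics for all TPC-H tables.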
+analyze part;
+analyze supplier;
+analyze partsupp;
+analyze customer;
+analyze orders;
+analyze lineitem;
+analyze nation;
+analyze region;
\ No newline at end of file
diff --git a/sql/tpch/create.sql b/sql/tpch/create.sql
new file mode 100644
index 00000000..befc8b4b
--- /dev/null
+++ b/sql/tpch/create.sql
@@ -0,0 +1,105 @@
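+-- TPC-H schema: composite primary keys and all secondary indexes are declared ASC.
+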
+create table region
+(
+ r_regionkey integer primary key,
+ r_name varchar(25) not null,
+ r_comment varchar(152)
+);
+
+create table nation
+(
+ n_nationkey integer primary key,
+ n_name varchar(25) not null,
+ n_regionkey integer references region (r_regionkey),
+ n_comment varchar(152)
+);
+
+create table part
+(
+ p_partkey integer primary key,
+ p_name varchar(55),
+ p_mfgr varchar(25),
+ p_brand varchar(10),
+ p_type varchar(25),
+ p_size integer,
+ p_container varchar(10),
+ p_retailprice decimal(15, 2),
+ p_comment varchar(23)
+);
+
+create table supplier
+(
+ s_suppkey integer primary key,
+ s_name varchar(25),
+ s_address varchar(40),
+ s_nationkey integer references nation (n_nationkey),
+ s_phone varchar(15),
+ s_acctbal decimal(15, 2),
+ s_comment varchar(101)
+);
+
+create table partsupp
+(
+ ps_partkey integer references part (p_partkey),
+ ps_suppkey integer references supplier (s_suppkey),
+ ps_availqty integer,
+ ps_supplycost decimal(15, 2),
+ ps_comment varchar(199),
+ primary key (ps_partkey ASC, ps_suppkey ASC)
+);
+
+create table customer
+(
+ c_custkey integer primary key,
+ c_name varchar(25),
+ c_address varchar(40),
+ c_nationkey integer references nation (n_nationkey),
+ c_phone varchar(15),
+ c_acctbal decimal(15, 2),
+ c_mktsegment varchar(10),
+ c_comment varchar(117)
+);
+
+create table orders
+(
+ o_orderkey integer primary key,
+ o_custkey integer references customer (c_custkey),
+ o_orderstatus char(1),
+ o_totalprice decimal(15, 2),
+ o_orderdate date,
+ o_orderpriority char(15),
+ o_clerk char(15),
+ o_shippriority integer,
+ o_comment varchar(79)
+);
+
+create table lineitem
+(
+ l_orderkey integer references orders (o_orderkey),
+ l_partkey integer references part (p_partkey),
+ l_suppkey integer references supplier (s_suppkey),
+ l_linenumber integer,
+ l_quantity decimal(15, 2),
+ l_extendedprice decimal(15, 2),
+ l_discount decimal(15, 2),
+ l_tax decimal(15, 2),
+ l_returnflag char(1),
+ l_linestatus char(1),
+ l_shipdate date,
+ l_commitdate date,
+ l_receiptdate date,
+ l_shipinstruct char(25),
+ l_shipmode char(10),
+ l_comment varchar(44),
+ primary key (l_orderkey ASC, l_suppkey ASC, l_partkey ASC, l_linenumber ASC)
+);
+
+create index idx_supplier_nation_key on supplier (s_nationkey ASC);
+create index idx_partsupp_partkey on partsupp (ps_partkey ASC);
+create index idx_partsupp_suppkey on partsupp (ps_suppkey ASC);
+create index idx_customer_nationkey on customer (c_nationkey ASC);
+create index idx_orders_custkey on orders (o_custkey ASC);
+create index idx_orders_orderdate on orders (o_orderdate ASC);
+create index idx_lineitem_orderkey on lineitem (l_orderkey ASC);
+create index idx_lineitem_part_supp on lineitem (l_partkey ASC, l_suppkey ASC);
+create index idx_lineitem_shipdate on lineitem (l_shipdate ASC, l_discount ASC, l_quantity ASC);
+create index idx_nation_regionkey on nation (n_regionkey ASC);
\ No newline at end of file
diff --git a/sql/tpch/drop.sql b/sql/tpch/drop.sql
new file mode 100644
index 00000000..a25283af
--- /dev/null
+++ b/sql/tpch/drop.sql
@@ -0,0 +1,8 @@
+drop table if exists part cascade;
+drop table if exists supplier cascade;
+drop table if exists partsupp cascade;
+drop table if exists customer cascade;
+drop table if exists orders cascade;
+drop table if exists lineitem cascade;
+drop table if exists nation cascade;
+drop table if exists region cascade;
diff --git a/sql/tpch/hash.create.sql b/sql/tpch/hash.create.sql
new file mode 100644
index 00000000..9d86146c
--- /dev/null
+++ b/sql/tpch/hash.create.sql
@@ -0,0 +1,105 @@
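+-- TPC-H schema variant with HASH composite primary keys and HASH secondary indexes.
+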
+create table region
+(
+ r_regionkey integer primary key,
+ r_name varchar(25) not null,
+ r_comment varchar(152)
+);
+
+create table nation
+(
+ n_nationkey integer primary key,
+ n_name varchar(25) not null,
+ n_regionkey integer references region (r_regionkey),
+ n_comment varchar(152)
+);
+
+create table part
+(
+ p_partkey integer primary key,
+ p_name varchar(55),
+ p_mfgr varchar(25),
+ p_brand varchar(10),
+ p_type varchar(25),
+ p_size integer,
+ p_container varchar(10),
+ p_retailprice decimal(15, 2),
+ p_comment varchar(23)
+);
+
+create table supplier
+(
+ s_suppkey integer primary key,
+ s_name varchar(25),
+ s_address varchar(40),
+ s_nationkey integer references nation (n_nationkey),
+ s_phone varchar(15),
+ s_acctbal decimal(15, 2),
+ s_comment varchar(101)
+);
+
+create table partsupp
+(
+ ps_partkey integer references part (p_partkey),
+ ps_suppkey integer references supplier (s_suppkey),
+ ps_availqty integer,
+ ps_supplycost decimal(15, 2),
+ ps_comment varchar(199),
+ primary key ((ps_partkey, ps_suppkey) HASH)
+);
+
+create table customer
+(
+ c_custkey integer primary key,
+ c_name varchar(25),
+ c_address varchar(40),
+ c_nationkey integer references nation (n_nationkey),
+ c_phone varchar(15),
+ c_acctbal decimal(15, 2),
+ c_mktsegment varchar(10),
+ c_comment varchar(117)
+);
+
+create table orders
+(
+ o_orderkey integer primary key,
+ o_custkey integer references customer (c_custkey),
+ o_orderstatus char(1),
+ o_totalprice decimal(15, 2),
+ o_orderdate date,
+ o_orderpriority char(15),
+ o_clerk char(15),
+ o_shippriority integer,
+ o_comment varchar(79)
+);
+
+create table lineitem
+(
+ l_orderkey integer references orders (o_orderkey),
+ l_partkey integer references part (p_partkey),
+ l_suppkey integer references supplier (s_suppkey),
+ l_linenumber integer,
+ l_quantity decimal(15, 2),
+ l_extendedprice decimal(15, 2),
+ l_discount decimal(15, 2),
+ l_tax decimal(15, 2),
+ l_returnflag char(1),
+ l_linestatus char(1),
+ l_shipdate date,
+ l_commitdate date,
+ l_receiptdate date,
+ l_shipinstruct char(25),
+ l_shipmode char(10),
+ l_comment varchar(44),
+ primary key ((l_orderkey, l_suppkey, l_partkey, l_linenumber) HASH)
+);
+
+create index idx_supplier_nation_key on supplier (s_nationkey HASH);
+create index idx_partsupp_partkey on partsupp (ps_partkey HASH);
+create index idx_partsupp_suppkey on partsupp (ps_suppkey HASH);
+create index idx_customer_nationkey on customer (c_nationkey HASH);
+create index idx_orders_custkey on orders (o_custkey HASH);
+create index idx_orders_orderdate on orders (o_orderdate HASH);
+create index idx_lineitem_orderkey on lineitem (l_orderkey HASH);
+create index idx_lineitem_part_supp on lineitem ((l_partkey, l_suppkey) HASH);
+create index idx_lineitem_shipdate on lineitem ((l_shipdate, l_discount, l_quantity) HASH);
+create index idx_nation_regionkey on nation (n_regionkey HASH);
\ No newline at end of file
diff --git a/sql/tpch/import.sql b/sql/tpch/import.sql
new file mode 100644
index 00000000..8584f4fe
--- /dev/null
+++ b/sql/tpch/import.sql
@@ -0,0 +1,8 @@
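+-- $DATA_PATH is a placeholder for the directory holding the generated *.tbl files;
+-- it is substituted before execution (see apply_variables() in the import.py helper).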
+COPY region FROM '$DATA_PATH/region.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
+COPY nation FROM '$DATA_PATH/nation.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
+COPY part FROM '$DATA_PATH/part.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
+COPY supplier FROM '$DATA_PATH/supplier.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
+COPY partsupp FROM '$DATA_PATH/partsupp.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
+COPY customer FROM '$DATA_PATH/customer.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
+COPY orders FROM '$DATA_PATH/orders.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
+COPY lineitem FROM '$DATA_PATH/lineitem.tbl' with (delimiter '|', FORMAT csv, NULL 'NULL');
diff --git a/sql/tpch/postgres.create.sql b/sql/tpch/postgres.create.sql
new file mode 100644
index 00000000..8c8c19ed
--- /dev/null
+++ b/sql/tpch/postgres.create.sql
@@ -0,0 +1,105 @@
+create table region
+(
+ r_regionkey integer primary key,
+ r_name varchar(25) not null,
+ r_comment varchar(152)
+);
+
+create table nation
+(
+ n_nationkey integer primary key,
+ n_name varchar(25) not null,
+ n_regionkey integer references region (r_regionkey),
+ n_comment varchar(152)
+);
+
+create table part
+(
+ p_partkey integer primary key,
+ p_name varchar(55),
+ p_mfgr varchar(25),
+ p_brand varchar(10),
+ p_type varchar(25),
+ p_size integer,
+ p_container varchar(10),
+ p_retailprice decimal(15, 2),
+ p_comment varchar(23)
+);
+
+create table supplier
+(
+ s_suppkey integer primary key,
+ s_name varchar(25),
+ s_address varchar(40),
+ s_nationkey integer references nation (n_nationkey),
+ s_phone varchar(15),
+ s_acctbal decimal(15, 2),
+ s_comment varchar(101)
+);
+
+create table partsupp
+(
+ ps_partkey integer references part (p_partkey),
+ ps_suppkey integer references supplier (s_suppkey),
+ ps_availqty integer,
+ ps_supplycost decimal(15, 2),
+ ps_comment varchar(199),
+ primary key (ps_partkey, ps_suppkey)
+);
+
+create table customer
+(
+ c_custkey integer primary key,
+ c_name varchar(25),
+ c_address varchar(40),
+ c_nationkey integer references nation (n_nationkey),
+ c_phone varchar(15),
+ c_acctbal decimal(15, 2),
+ c_mktsegment varchar(10),
+ c_comment varchar(117)
+);
+
+create table orders
+(
+ o_orderkey integer primary key,
+ o_custkey integer references customer (c_custkey),
+ o_orderstatus char(1),
+ o_totalprice decimal(15, 2),
+ o_orderdate date,
+ o_orderpriority char(15),
+ o_clerk char(15),
+ o_shippriority integer,
+ o_comment varchar(79)
+);
+
+create table lineitem
+(
+ l_orderkey integer references orders (o_orderkey),
+ l_partkey integer references part (p_partkey),
+ l_suppkey integer references supplier (s_suppkey),
+ l_linenumber integer,
+ l_quantity decimal(15, 2),
+ l_extendedprice decimal(15, 2),
+ l_discount decimal(15, 2),
+ l_tax decimal(15, 2),
+ l_returnflag char(1),
+ l_linestatus char(1),
+ l_shipdate date,
+ l_commitdate date,
+ l_receiptdate date,
+ l_shipinstruct char(25),
+ l_shipmode char(10),
+ l_comment varchar(44),
+ primary key (l_orderkey, l_suppkey, l_partkey, l_linenumber)
+);
+
+create index idx_supplier_nation_key on supplier (s_nationkey);
+create index idx_partsupp_partkey on partsupp (ps_partkey);
+create index idx_partsupp_suppkey on partsupp (ps_suppkey);
+create index idx_customer_nationkey on customer (c_nationkey);
+create index idx_orders_custkey on orders (o_custkey);
+create index idx_orders_orderdate on orders (o_orderdate);
+create index idx_lineitem_orderkey on lineitem (l_orderkey);
+create index idx_lineitem_part_supp on lineitem (l_partkey, l_suppkey);
+create index idx_lineitem_shipdate on lineitem (l_shipdate, l_discount, l_quantity);
+create index idx_nation_regionkey on nation (n_regionkey);
\ No newline at end of file
diff --git a/sql/tpch/queries/q01.sql b/sql/tpch/queries/q01.sql
new file mode 100644
index 00000000..2b0ef67c
--- /dev/null
+++ b/sql/tpch/queries/q01.sql
@@ -0,0 +1,21 @@
+SELECT
+ l_returnflag,
+ l_linestatus,
+ SUM(l_quantity) AS sum_qty,
+ SUM(l_extendedprice) AS sum_base_price,
+ SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
+ SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
+ AVG(l_quantity) AS avg_qty,
+ AVG(l_extendedprice) AS avg_price,
+ AVG(l_discount) AS avg_disc,
+ COUNT(*) AS count_order
+FROM
+ lineitem
+WHERE
+ l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
+GROUP BY
+ l_returnflag,
+ l_linestatus
+ORDER BY
+ l_returnflag,
+ l_linestatus;
\ No newline at end of file
diff --git a/sql/tpch/queries/q02.sql b/sql/tpch/queries/q02.sql
new file mode 100644
index 00000000..32e26f95
--- /dev/null
+++ b/sql/tpch/queries/q02.sql
@@ -0,0 +1,35 @@
+SELECT s_acctbal,
+ s_name,
+ n_name,
+ p_partkey,
+ p_mfgr,
+ s_address,
+ s_phone,
+ s_comment
+FROM part,
+ supplier,
+ partsupp,
+ nation,
+ region
+WHERE p_partkey = ps_partkey
+ AND s_suppkey = ps_suppkey
+ AND p_size = 15
+ AND p_type LIKE '%BRASS%'
+ AND s_nationkey = n_nationkey
+ AND n_regionkey = r_regionkey
+ AND r_name = 'EUROPE'
+ AND ps_supplycost = (SELECT MIN(ps_supplycost)
+ FROM partsupp,
+ supplier,
+ nation,
+ region
+ WHERE p_partkey = ps_partkey
+ AND s_suppkey = ps_suppkey
+ AND s_nationkey = n_nationkey
+ AND n_regionkey = r_regionkey
+ AND r_name = 'EUROPE')
+ORDER BY s_acctbal DESC,
+ n_name,
+ s_name,
+ p_partkey
+LIMIT 100;
\ No newline at end of file
diff --git a/sql/tpch/queries/q03.sql b/sql/tpch/queries/q03.sql
new file mode 100644
index 00000000..b0373d0b
--- /dev/null
+++ b/sql/tpch/queries/q03.sql
@@ -0,0 +1,23 @@
+SELECT
+ l_orderkey,
+ SUM(l_extendedprice * (1 - l_discount)) AS revenue,
+ o_orderdate,
+ o_shippriority
+FROM
+ customer,
+ orders,
+ lineitem
+WHERE
+ c_mktsegment = 'BUILDING'
+ AND c_custkey = o_custkey
+ AND l_orderkey = o_orderkey
+ AND o_orderdate < DATE '1995-03-15'
+ AND l_shipdate > DATE '1995-03-15'
+GROUP BY
+ l_orderkey,
+ o_orderdate,
+ o_shippriority
+ORDER BY
+ revenue DESC,
+ o_orderdate
+LIMIT 10;
\ No newline at end of file
diff --git a/sql/tpch/queries/q04.sql b/sql/tpch/queries/q04.sql
new file mode 100644
index 00000000..f706c776
--- /dev/null
+++ b/sql/tpch/queries/q04.sql
@@ -0,0 +1,11 @@
+SELECT o_orderpriority,
+ COUNT(*) AS order_count
+FROM orders
+WHERE o_orderdate >= DATE '1993-07-01'
+ AND o_orderdate < DATE '1993-07-01' + interval '3' month
+ AND EXISTS (SELECT *
+ FROM lineitem
+ WHERE l_orderkey = o_orderkey
+ AND l_commitdate < l_receiptdate)
+GROUP BY o_orderpriority
+ORDER BY o_orderpriority;
\ No newline at end of file
diff --git a/sql/tpch/queries/q05.sql b/sql/tpch/queries/q05.sql
new file mode 100644
index 00000000..56108d09
--- /dev/null
+++ b/sql/tpch/queries/q05.sql
@@ -0,0 +1,19 @@
+SELECT n_name,
+ SUM(l_extendedprice * (1 - l_discount)) AS revenue
+FROM customer,
+ orders,
+ lineitem,
+ supplier,
+ nation,
+ region
+WHERE c_custkey = o_custkey
+ AND l_orderkey = o_orderkey
+ AND l_suppkey = s_suppkey
+ AND c_nationkey = s_nationkey
+ AND s_nationkey = n_nationkey
+ AND n_regionkey = r_regionkey
+ AND r_name = 'ASIA'
+ AND o_orderdate >= DATE '1994-01-01'
+ AND o_orderdate < DATE '1994-01-01' + interval '1' year
+GROUP BY n_name
+ORDER BY revenue DESC;
\ No newline at end of file
diff --git a/sql/tpch/queries/q06.sql b/sql/tpch/queries/q06.sql
new file mode 100644
index 00000000..59267e04
--- /dev/null
+++ b/sql/tpch/queries/q06.sql
@@ -0,0 +1,6 @@
+select sum(l_extendedprice * l_discount) as revenue
+from lineitem
+where l_shipdate >= date '1994-01-01'
+ and l_shipdate < date '1994-01-01' + interval '1' year
+ and l_discount between 0.06 - 0.01 and 0.06 + 0.01
+ and l_quantity < 24;
\ No newline at end of file
diff --git a/sql/tpch/queries/q07.sql b/sql/tpch/queries/q07.sql
new file mode 100644
index 00000000..c032a22e
--- /dev/null
+++ b/sql/tpch/queries/q07.sql
@@ -0,0 +1,30 @@
+select supp_nation,
+ cust_nation,
+ l_year,
+ sum(volume) as revenue
+from (select n1.n_name as supp_nation,
+ n2.n_name as cust_nation,
+ extract(year from l_shipdate) as l_year,
+ l_extendedprice * (1 - l_discount) as volume
+ from supplier,
+ lineitem,
+ orders,
+ customer,
+ nation n1,
+ nation n2
+ where s_suppkey = l_suppkey
+ and o_orderkey = l_orderkey
+ and c_custkey = o_custkey
+ and s_nationkey = n1.n_nationkey
+ and c_nationkey = n2.n_nationkey
+ and (
+ (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
+ or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
+ )
+ and l_shipdate between date '1995-01-01' and date '1996-12-31') as shipping
+group by supp_nation,
+ cust_nation,
+ l_year
+order by supp_nation,
+ cust_nation,
+ l_year;
\ No newline at end of file
diff --git a/sql/tpch/queries/q08.sql b/sql/tpch/queries/q08.sql
new file mode 100644
index 00000000..86282ccf
--- /dev/null
+++ b/sql/tpch/queries/q08.sql
@@ -0,0 +1,29 @@
+select o_year,
+ sum(case
+ when nation = 'BRAZIL'
+ then volume
+ else 0
+ end) / sum(volume) as mkt_share
+from (select extract(year from o_orderdate) as o_year,
+ l_extendedprice * (1 - l_discount) as volume,
+ n2.n_name as nation
+ from part,
+ supplier,
+ lineitem,
+ orders,
+ customer,
+ nation n1,
+ nation n2,
+ region
+ where p_partkey = l_partkey
+ and s_suppkey = l_suppkey
+ and l_orderkey = o_orderkey
+ and o_custkey = c_custkey
+ and c_nationkey = n1.n_nationkey
+ and n1.n_regionkey = r_regionkey
+ and r_name = 'AMERICA'
+ and s_nationkey = n2.n_nationkey
+ and o_orderdate between date '1995-01-01' and date '1996-12-31'
+ and p_type = 'ECONOMY ANODIZED STEEL') as all_nations
+group by o_year
+order by o_year;
\ No newline at end of file
diff --git a/sql/tpch/queries/q09.sql b/sql/tpch/queries/q09.sql
new file mode 100644
index 00000000..0f7f4ae4
--- /dev/null
+++ b/sql/tpch/queries/q09.sql
@@ -0,0 +1,23 @@
+select nation,
+ o_year,
+ sum(amount) as sum_profit
+from (select n_name as nation,
+ extract(year from o_orderdate) as o_year,
+ l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
+ from part,
+ supplier,
+ lineitem,
+ partsupp,
+ orders,
+ nation
+ where s_suppkey = l_suppkey
+ and ps_suppkey = l_suppkey
+ and ps_partkey = l_partkey
+ and p_partkey = l_partkey
+ and o_orderkey = l_orderkey
+ and s_nationkey = n_nationkey
+ and p_name like '%green%') as profit
+group by nation,
+ o_year
+order by nation,
+ o_year desc;
\ No newline at end of file
diff --git a/sql/tpch/queries/q10.sql b/sql/tpch/queries/q10.sql
new file mode 100644
index 00000000..4cf0da09
--- /dev/null
+++ b/sql/tpch/queries/q10.sql
@@ -0,0 +1,27 @@
+select c_custkey,
+ c_name,
+ sum(l_extendedprice * (1 - l_discount)) as revenue,
+ c_acctbal,
+ n_name,
+ c_address,
+ c_phone,
+ c_comment
+from customer,
+ orders,
+ lineitem,
+ nation
+where c_custkey = o_custkey
+ and l_orderkey = o_orderkey
+ and o_orderdate >= date '1993-10-01'
+ and o_orderdate < date '1993-10-01' + interval '3' month
+ and l_returnflag = 'R'
+ and c_nationkey = n_nationkey
+group by c_custkey,
+ c_name,
+ c_acctbal,
+ c_phone,
+ n_name,
+ c_address,
+ c_comment
+order by revenue desc
+LIMIT 20;
\ No newline at end of file
diff --git a/sql/tpch/queries/q11.sql b/sql/tpch/queries/q11.sql
new file mode 100644
index 00000000..ce2d2d93
--- /dev/null
+++ b/sql/tpch/queries/q11.sql
@@ -0,0 +1,17 @@
+select ps_partkey,
+ sum(ps_supplycost * ps_availqty) as value
+from partsupp,
+ supplier,
+ nation
+where ps_suppkey = s_suppkey
+ and s_nationkey = n_nationkey
+ and n_name = 'GERMANY'
+group by ps_partkey
+having sum(ps_supplycost * ps_availqty) > (select sum(ps_supplycost * ps_availqty) * 0.0001
+ from partsupp,
+ supplier,
+ nation
+ where ps_suppkey = s_suppkey
+ and s_nationkey = n_nationkey
+ and n_name = 'GERMANY')
+order by value desc;
\ No newline at end of file
diff --git a/sql/tpch/queries/q12.sql b/sql/tpch/queries/q12.sql
new file mode 100644
index 00000000..78cb09da
--- /dev/null
+++ b/sql/tpch/queries/q12.sql
@@ -0,0 +1,23 @@
+select l_shipmode,
+ sum(case
+ when o_orderpriority = '1-URGENT'
+ or o_orderpriority = '2-HIGH'
+ then 1
+ else 0
+ end) as high_line_count,
+ sum(case
+ when o_orderpriority <> '1-URGENT'
+ and o_orderpriority <> '2-HIGH'
+ then 1
+ else 0
+ end) as low_line_count
+from orders,
+ lineitem
+where o_orderkey = l_orderkey
+ and l_shipmode in ('MAIL', 'SHIP')
+ and l_commitdate < l_receiptdate
+ and l_shipdate < l_commitdate
+ and l_receiptdate >= date '1994-01-01'
+ and l_receiptdate < date '1994-01-01' + interval '1' year
+group by l_shipmode
+order by l_shipmode;
\ No newline at end of file
diff --git a/sql/tpch/queries/q13.sql b/sql/tpch/queries/q13.sql
new file mode 100644
index 00000000..b9bf8fba
--- /dev/null
+++ b/sql/tpch/queries/q13.sql
@@ -0,0 +1,12 @@
+select c_count,
+ count(*) as custdist
+from (select c_custkey,
+ count(o_orderkey)
+ from customer
+ left outer join orders on
+ c_custkey = o_custkey
+ and o_comment not like '%special%requests%'
+ group by c_custkey) as c_orders (c_custkey, c_count)
+group by c_count
+order by custdist desc,
+ c_count desc;
\ No newline at end of file
diff --git a/sql/tpch/queries/q14.sql b/sql/tpch/queries/q14.sql
new file mode 100644
index 00000000..820b4550
--- /dev/null
+++ b/sql/tpch/queries/q14.sql
@@ -0,0 +1,9 @@
+SELECT 100.00 * SUM(CASE
+ WHEN p_type LIKE 'PROMO%' THEN l_extendedprice * (1 - l_discount)
+ ELSE 0
+ END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
+FROM lineitem,
+ part
+WHERE l_partkey = p_partkey
+ AND l_shipdate >= DATE '1995-09-01'
+ AND l_shipdate < DATE '1995-09-01' + interval '1' month;
\ No newline at end of file
diff --git a/sql/tpch/queries/q15.sql b/sql/tpch/queries/q15.sql
new file mode 100644
index 00000000..44a10102
--- /dev/null
+++ b/sql/tpch/queries/q15.sql
@@ -0,0 +1,17 @@
+WITH revenue AS (SELECT l_suppkey AS supplier_no,
+ SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
+ FROM lineitem
+ WHERE l_shipdate >= DATE '1996-01-01'
+ AND l_shipdate < DATE '1996-01-01' + interval '3' month
+ GROUP BY l_suppkey)
+SELECT s_suppkey,
+ s_name,
+ s_address,
+ s_phone,
+ total_revenue
+FROM supplier,
+ revenue
+WHERE s_suppkey = supplier_no
+ AND total_revenue = (SELECT MAX(total_revenue)
+ FROM revenue)
+ORDER BY s_suppkey;
\ No newline at end of file
diff --git a/sql/tpch/queries/q16.sql b/sql/tpch/queries/q16.sql
new file mode 100644
index 00000000..304416fe
--- /dev/null
+++ b/sql/tpch/queries/q16.sql
@@ -0,0 +1,20 @@
+SELECT p_brand,
+ p_type,
+ p_size,
+ COUNT(DISTINCT ps_suppkey) AS supplier_cnt
+FROM partsupp,
+ part
+WHERE p_partkey = ps_partkey
+ AND p_brand <> 'Brand#45'
+ AND p_type NOT LIKE 'MEDIUM POLISHED%'
+ AND p_size IN (49, 14, 23, 45, 19, 3, 36, 9)
+ AND ps_suppkey NOT IN (SELECT s_suppkey
+ FROM supplier
+ WHERE s_comment LIKE '%Customer%Complaints%')
+GROUP BY p_brand,
+ p_type,
+ p_size
+ORDER BY supplier_cnt DESC,
+ p_brand,
+ p_type,
+ p_size;
\ No newline at end of file
diff --git a/sql/tpch/queries/q17.sql b/sql/tpch/queries/q17.sql
new file mode 100644
index 00000000..82a7f04a
--- /dev/null
+++ b/sql/tpch/queries/q17.sql
@@ -0,0 +1,9 @@
+SELECT SUM(l_extendedprice) / 7.0 AS avg_yearly
+FROM lineitem,
+ part
+WHERE p_partkey = l_partkey
+ AND p_brand = 'Brand#23'
+ AND p_container = 'MED BOX'
+ AND l_quantity < (SELECT 0.2 * AVG(l_quantity)
+ FROM lineitem
+ WHERE l_partkey = p_partkey);
\ No newline at end of file
diff --git a/sql/tpch/queries/q18.sql b/sql/tpch/queries/q18.sql
new file mode 100644
index 00000000..456e8a31
--- /dev/null
+++ b/sql/tpch/queries/q18.sql
@@ -0,0 +1,23 @@
+SELECT c_name,
+ c_custkey,
+ o_orderkey,
+ o_orderdate,
+ o_totalprice,
+ SUM(l_quantity)
+FROM customer,
+ orders,
+ lineitem
+WHERE o_orderkey IN (SELECT l_orderkey
+ FROM lineitem
+ GROUP BY l_orderkey
+ HAVING SUM(l_quantity) > 300)
+ AND c_custkey = o_custkey
+ AND o_orderkey = l_orderkey
+GROUP BY c_name,
+ c_custkey,
+ o_orderkey,
+ o_orderdate,
+ o_totalprice
+ORDER BY o_totalprice DESC,
+ o_orderdate
+LIMIT 100;
\ No newline at end of file
diff --git a/sql/tpch/queries/q19.sql b/sql/tpch/queries/q19.sql
new file mode 100644
index 00000000..333322db
--- /dev/null
+++ b/sql/tpch/queries/q19.sql
@@ -0,0 +1,30 @@
+SELECT SUM(l_extendedprice * (1 - l_discount)) AS revenue
+FROM lineitem,
+ part
+WHERE (
+ p_partkey = l_partkey
+ AND p_brand = 'Brand#12'
+ AND p_container IN ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
+ AND l_quantity >= 1 AND l_quantity <= 1 + 10
+ AND p_size BETWEEN 1 AND 5
+ AND l_shipmode IN ('AIR', 'AIR REG')
+ AND l_shipinstruct = 'DELIVER IN PERSON'
+ )
+ OR (
+ p_partkey = l_partkey
+ AND p_brand = 'Brand#23'
+ AND p_container IN ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
+ AND l_quantity >= 10 AND l_quantity <= 10 + 10
+ AND p_size BETWEEN 1 AND 10
+ AND l_shipmode IN ('AIR', 'AIR REG')
+ AND l_shipinstruct = 'DELIVER IN PERSON'
+ )
+ OR (
+ p_partkey = l_partkey
+ AND p_brand = 'Brand#34'
+ AND p_container IN ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
+ AND l_quantity >= 20 AND l_quantity <= 20 + 10
+ AND p_size BETWEEN 1 AND 15
+ AND l_shipmode IN ('AIR', 'AIR REG')
+ AND l_shipinstruct = 'DELIVER IN PERSON'
+ );
\ No newline at end of file
diff --git a/sql/tpch/queries/q20.sql b/sql/tpch/queries/q20.sql
new file mode 100644
index 00000000..c2c6f75e
--- /dev/null
+++ b/sql/tpch/queries/q20.sql
@@ -0,0 +1,18 @@
+SELECT s_name,
+ s_address
+FROM supplier,
+ nation
+WHERE s_suppkey IN (SELECT ps_suppkey
+ FROM partsupp
+ WHERE ps_partkey IN (SELECT p_partkey
+ FROM part
+ WHERE p_name LIKE 'forest%')
+ AND ps_availqty > (SELECT 0.5 * SUM(l_quantity)
+ FROM lineitem
+ WHERE l_partkey = ps_partkey
+ AND l_suppkey = ps_suppkey
+ AND l_shipdate >= DATE '1994-01-01'
+ AND l_shipdate < DATE '1994-01-01' + interval '1' year))
+ AND s_nationkey = n_nationkey
+ AND n_name = 'CANADA'
+ORDER BY s_name;
\ No newline at end of file
diff --git a/sql/tpch/queries/q21.sql b/sql/tpch/queries/q21.sql
new file mode 100644
index 00000000..4b88bb9d
--- /dev/null
+++ b/sql/tpch/queries/q21.sql
@@ -0,0 +1,25 @@
+SELECT s_name,
+ COUNT(*) AS numwait
+FROM supplier,
+ lineitem l1,
+ orders,
+ nation
+WHERE s_suppkey = l1.l_suppkey
+ AND o_orderkey = l1.l_orderkey
+ AND o_orderstatus = 'F'
+ AND l1.l_receiptdate > l1.l_commitdate
+ AND EXISTS (SELECT *
+ FROM lineitem l2
+ WHERE l2.l_orderkey = l1.l_orderkey
+ AND l2.l_suppkey <> l1.l_suppkey)
+ AND NOT EXISTS (SELECT *
+ FROM lineitem l3
+ WHERE l3.l_orderkey = l1.l_orderkey
+ AND l3.l_suppkey <> l1.l_suppkey
+ AND l3.l_receiptdate > l3.l_commitdate)
+ AND s_nationkey = n_nationkey
+ AND n_name = 'SAUDI ARABIA'
+GROUP BY s_name
+ORDER BY numwait DESC,
+ s_name
+LIMIT 100;
\ No newline at end of file
diff --git a/sql/tpch/queries/q22.sql b/sql/tpch/queries/q22.sql
new file mode 100644
index 00000000..fb60fbc1
--- /dev/null
+++ b/sql/tpch/queries/q22.sql
@@ -0,0 +1,16 @@
+SELECT cntrycode,
+ COUNT(*) AS numcust,
+ SUM(c_acctbal) AS totacctbal
+FROM (SELECT SUBSTRING(c_phone FROM 1 FOR 2) AS cntrycode,
+ c_acctbal
+ FROM customer
+ WHERE SUBSTRING(c_phone FROM 1 FOR 2) IN ('13', '31', '23', '29', '30', '18', '17')
+ AND c_acctbal > (SELECT AVG(c_acctbal)
+ FROM customer
+ WHERE c_acctbal > 0.00
+ AND SUBSTRING(c_phone FROM 1 FOR 2) IN ('13', '31', '23', '29', '30', '18', '17'))
+ AND NOT EXISTS (SELECT *
+ FROM orders
+ WHERE o_custkey = c_custkey)) AS custsale
+GROUP BY cntrycode
+ORDER BY cntrycode;
\ No newline at end of file
diff --git a/src/actions/collect.py b/src/actions/collect.py
new file mode 100644
index 00000000..27cc6dd5
--- /dev/null
+++ b/src/actions/collect.py
@@ -0,0 +1,308 @@
+import subprocess
+
+import psycopg2
+from tqdm import tqdm
+
+from actions.collects.pg_unit import PgUnitGenerator
+from config import Config, DDLStep
+from models.factory import get_test_model
+from objects import EXPLAIN, ExplainFlags
+from utils import evaluate_sql, calculate_avg_execution_time, get_md5, allowed_diff, \
+ extract_execution_time_from_analyze, current_milli_time
+
+
+class CollectAction:
+ def __init__(self):
+ self.config = Config()
+ self.logger = self.config.logger
+ self.sut_database = self.config.database
+
+ def start_db(self):
+ self.logger.info(f"Initializing {self.sut_database.__class__.__name__} DB")
+
+ commit_hash = self.config.revision
+
+ self.sut_database.change_version_and_compile(commit_hash)
+ self.sut_database.stop_database()
+ self.sut_database.destroy()
+ self.sut_database.start_database()
+
+ return self.get_commit_message(commit_hash)
+
+ def get_commit_message(self, commit_hash):
+ if commit_hash:
+ output = str(subprocess.check_output(
+ f"echo `git log -n 1 --pretty=format:%s {commit_hash}`",
+ cwd=self.config.source_path,
+ shell=True)).rstrip('\n')
+ return f"{output} ({commit_hash})"
+ else:
+ return ""
+
+ def evaluate(self):
+ loader = self.config.database.get_results_loader()
+
+ self.start_db()
+ try:
+ self.sut_database.create_test_database()
+
+ self.sut_database.establish_connection(self.config.connection.database)
+
+ loq = self.config.database.get_list_queries()
+ with self.sut_database.connection.conn.cursor() as cur:
+ loq.git_message, loq.db_version = self.sut_database.get_revision_version(cur)
+ loq.database_config = self.sut_database.get_database_config(cur)
+
+ loq.ddl_execution_time, loq.model_execution_time, loq.model_queries, loq.queries = (
+ self.run_ddl_and_testing_queries(self.sut_database.connection.conn, self.config.with_optimizations))
+
+ loq.config = str(self.config)
+
+ self.logger.info(f"Storing results to report/{self.config.output}")
+ loader.store_queries_to_file(loq, self.config.output)
+ except Exception as e:
+ self.logger.exception(e)
+ raise e
+ finally:
+ if self.config.clean_db:
+ self.sut_database.drop_test_database()
+ self.sut_database.stop_database()
+
+ def run_ddl_and_testing_queries(self,
+ connection,
+ evaluate_optimizations=False):
+ try:
+ model = get_test_model()
+
+ ddl_start_time = current_milli_time()
+ created_tables, \
+ non_catalog_tables, \
+ teardown_queries, \
+ create_queries, \
+ analyze_queries, \
+ import_queries = model.create_tables(connection)
+ ddl_execution_time = int((current_milli_time() - ddl_start_time) / 1000)
+
+ model_queries = teardown_queries + create_queries + analyze_queries + import_queries
+ queries = model.get_queries(created_tables)
+
+ if DDLStep.COMPACT in self.config.ddls:
+ self.sut_database.run_compaction(tables=non_catalog_tables)
+ except Exception as e:
+ self.logger.exception("Failed to evaluate DDL queries", e)
+ exit(1)
+
+ connection.autocommit = False
+ model_start_time = current_milli_time()
+ self.evaluate_testing_queries(connection, queries, evaluate_optimizations)
+ model_execution_time = int((current_milli_time() - model_start_time) / 1000)
+
+ PgUnitGenerator().generate_postgres_unit_tests(teardown_queries,
+ create_queries,
+ queries)
+
+ return ddl_execution_time, model_execution_time, model_queries, queries
+
+ def evaluate_testing_queries(self, conn, queries, evaluate_optimizations):
+ counter = 1
+ for original_query in queries:
+ with conn.cursor() as cur:
+ try:
+ self.logger.info(
+ f"Evaluating query with hash {original_query.query_hash} [{counter}/{len(queries)}]")
+ self.sut_database.set_query_timeout(cur, self.config.test_query_timeout)
+
+ # get default execution plan
+ self.sut_database.prepare_query_execution(cur, original_query)
+ evaluate_sql(cur, original_query.get_explain(EXPLAIN))
+ default_execution_plan = self.config.database.get_execution_plan(
+ '\n'.join(str(item[0]) for item in cur.fetchall()))
+ conn.rollback()
+
+                    # keep the default execution plan in case query execution fails later
+ original_query.execution_plan = default_execution_plan
+
+ # get costs off execution plan
+ self.sut_database.prepare_query_execution(cur, original_query)
+ evaluate_sql(cur, original_query.get_explain(EXPLAIN, [ExplainFlags.COSTS_OFF]))
+ original_query.cost_off_explain = self.config.database.get_execution_plan(
+ '\n'.join(str(item[0]) for item in cur.fetchall()))
+ conn.rollback()
+
+ self.define_min_execution_time(conn, cur, original_query)
+
+ if self.config.plans_only:
+ original_query.execution_time_ms = default_execution_plan.get_estimated_cost()
+ else:
+ query_str = original_query.get_explain(EXPLAIN, options=[ExplainFlags.ANALYZE]) \
+ if self.config.server_side_execution else None
+ calculate_avg_execution_time(cur, original_query, self.sut_database,
+ query_str=query_str,
+ num_retries=int(self.config.num_retries),
+ connection=conn)
+
+ if evaluate_optimizations and "dml" not in original_query.optimizer_tips.tags:
+ self.logger.debug("Evaluating optimizations...")
+ self.evaluate_optimizations(conn, cur, original_query)
+
+ if not self.config.plans_only:
+ self.validate_result_hash(original_query)
+ self.validate_execution_time(original_query)
+
+ except psycopg2.Error as pe:
+ # do not raise exception
+ self.logger.exception(f"{original_query}\nFailed because of {pe}")
+ except Exception as e:
+ self.logger.info(original_query)
+ raise e
+ finally:
+ counter += 1
+
+ conn.rollback()
+
+ def validate_result_hash(self, original_query):
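+        # compare the original query's result hash with each optimization's hash; a mismatch indicates inconsistent results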
+ if "skip_consistency_check" in original_query.optimizer_tips.tags:
+ return
+
+ result_hash = original_query.result_hash
+ for optimization in original_query.optimizations:
+            # use the first available optimization result hash as the default when the original hash is missing
+ if not result_hash and optimization.result_hash:
+ result_hash = optimization.result_hash
+
+ if optimization.result_hash and result_hash != optimization.result_hash:
+ cardinality_equality = "=" if original_query.result_cardinality == optimization.result_cardinality else "!="
+
+ if "now()" in original_query.query.lower():
+                    # TODO: fix result_hash handling for queries with function calls
+ optimization.query_hash = result_hash
+ continue
+
+ self.config.has_failures = True
+ self.logger.exception(f"UNSTABLE: INCONSISTENT RESULTS\n"
+ f"Validation: {result_hash} != {optimization.result_hash}\n"
+ f"Cardinality: {original_query.result_cardinality} {cardinality_equality} {optimization.result_cardinality}\n"
+ f"Reproducer original: {original_query.query}\n"
+ f"Reproducer optimization: /*+ {optimization.explain_hints} */ {optimization.query}\n")
+
+ if self.config.exit_on_fail:
+ exit(1)
+
+ def validate_execution_time(self, original_query):
+ explain_execution_time = extract_execution_time_from_analyze(original_query.execution_plan.full_str)
+ avg_execution_time = original_query.execution_time_ms
+
+ if explain_execution_time and (explain_execution_time > avg_execution_time and
+ not allowed_diff(self.config, avg_execution_time, explain_execution_time)):
+ self.config.has_warnings = True
+ self.logger.warning(f"UNSTABLE: WARNING\n"
+ f"ANALYZE query execution time is too large:\n"
+ f"Execution times (explain vs avg): {explain_execution_time} < {avg_execution_time}\n"
+ f"Query: {original_query.query}\n")
+
+ def define_min_execution_time(self, conn, cur, original_query):
+ if self.config.baseline_results:
+ if baseline_result := \
+ self.config.baseline_results.find_query_by_hash(original_query.query_hash):
+ # get the best optimization from baseline run
+ best_optimization = baseline_result.get_best_optimization(self.config)
+ query_str = best_optimization.get_explain(EXPLAIN, options=[ExplainFlags.ANALYZE]) \
+ if self.config.server_side_execution else None
+ calculate_avg_execution_time(cur,
+ best_optimization,
+ self.sut_database,
+ query_str=query_str,
+ num_retries=int(self.config.num_retries),
+ connection=conn)
+ self.set_query_timeout_based_on_previous_execution(cur,
+ best_optimization.execution_time_ms,
+ original_query)
+ else:
+ self.sut_database.set_query_timeout(cur, self.config.test_query_timeout)
+ else:
+ self.sut_database.set_query_timeout(cur, self.config.test_query_timeout)
+
+ def evaluate_optimizations(self, connection, cur, original_query):
+ # build all possible optimizations
+ database = self.config.database
+ list_of_optimizations = database.get_list_optimizations(original_query)
+
+ self.logger.debug(f"{len(list_of_optimizations)} optimizations generated")
+ progress_bar = tqdm(list_of_optimizations)
+ duplicates = 0
+ timed_out = 0
+ min_execution_time = original_query.execution_time_ms \
+ if original_query.execution_time_ms > 0 else (self.config.test_query_timeout * 1000)
+ original_query.optimizations = []
+ execution_plans_checked = set()
+
+ for optimization in progress_bar:
+            # when optimizer statistics are enabled, evaluation here
+            # may fail and hit the query timeout
+ original_query.optimizations.append(optimization)
+
+ # set maximum execution time if this is first query,
+ # or we are evaluating queries near best execution time
+ if self.config.look_near_best_plan or len(original_query.optimizations) == 1:
+ self.set_query_timeout_based_on_previous_execution(cur, min_execution_time, original_query)
+
+ # check that execution plan is unique
+ evaluate_sql(cur, optimization.get_explain(EXPLAIN, options=[ExplainFlags.COSTS_OFF]))
+ optimization.cost_off_explain = database.get_execution_plan(
+ '\n'.join(str(item[0]) for item in cur.fetchall())
+ )
+
+ self.try_to_get_default_explain_hints(optimization, original_query)
+
+ exec_plan_md5 = get_md5(optimization.cost_off_explain.get_clean_plan())
+ not_unique_plan = exec_plan_md5 in execution_plans_checked
+ execution_plans_checked.add(exec_plan_md5)
+ query_str = optimization.get_explain(EXPLAIN, options=[ExplainFlags.ANALYZE]) \
+ if self.config.server_side_execution else None
+
+ if not_unique_plan:
+ duplicates += 1
+ else:
+ try:
+ self.sut_database.prepare_query_execution(cur, optimization)
+ evaluate_sql(cur, optimization.get_explain(EXPLAIN))
+ default_execution_plan = database.get_execution_plan(
+ '\n'.join(str(item[0]) for item in cur.fetchall())
+ )
+ except psycopg2.errors.QueryCanceled as e:
+                    # getting EXPLAIN itself timed out; skip this optimization
+ self.logger.exception(f"Getting default execution plan failed with {e}")
+ continue
+
+ if self.config.plans_only:
+ original_query.execution_plan = default_execution_plan
+ original_query.execution_time_ms = default_execution_plan.get_estimated_cost()
+ elif not calculate_avg_execution_time(
+ cur,
+ optimization,
+ self.sut_database,
+ query_str=query_str,
+ num_retries=int(self.config.num_retries),
+ connection=connection):
+ timed_out += 1
+
+ # get new minimum execution time
+ if 0 < optimization.execution_time_ms < min_execution_time:
+ min_execution_time = optimization.execution_time_ms
+
+ progress_bar.set_postfix(
+ {'skipped': f"(dp: {duplicates}, to: {timed_out})", 'min_time_ms': "{:.2f}".format(min_execution_time)})
+
+ return list_of_optimizations
+
+ def set_query_timeout_based_on_previous_execution(self, cur, min_execution_time, original_query):
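+        # prefer the per-query max_timeout from the optimizer tips; otherwise derive the timeout (in seconds)
+        # from the best execution time seen so far plus skip_timeout_delta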
+ optimizer_query_timeout = \
+ (original_query.optimizer_tips and original_query.optimizer_tips.max_timeout) or \
+ f"{int(min_execution_time / 1000) + int(self.config.skip_timeout_delta)}"
+ self.sut_database.set_query_timeout(cur, optimizer_query_timeout)
+
+ @staticmethod
+ def try_to_get_default_explain_hints(optimization, original_query):
+ if not original_query.explain_hints:
+ if original_query.compare_plans(optimization):
+ original_query.explain_hints = optimization.explain_hints
diff --git a/src/actions/collects/pg_unit.py b/src/actions/collects/pg_unit.py
new file mode 100644
index 00000000..568d9036
--- /dev/null
+++ b/src/actions/collects/pg_unit.py
@@ -0,0 +1,87 @@
+import os
+from inspect import cleandoc
+
+from config import Config
+from objects import ExplainFlags, EXPLAIN
+from utils import parse_clear_and_parametrized_sql
+
+
+class PgUnitGenerator:
+
+ def __init__(self):
+ self.config = Config()
+
+ @staticmethod
+ def add_semicolon(line: str):
+ if line.endswith(";"):
+ return line
+ else:
+ return f"{line};"
+
+ @staticmethod
+ def wrap_query_plan(plan: str):
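+        # render the plan the way psql prints EXPLAIN output: a centered 'QUERY PLAN' header,
+        # a dashed separator and an '(N rows)' footer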
+ num_rows = len(plan.split("\n"))
+ num_rows_unit = "row" if num_rows == 1 else "rows"
+ max_length = max(len(line) for line in plan.split("\n")) + 2
+ plan = "\n".join([f" {row}" for row in plan.split("\n")])
+
+ return f"""{' ' * int(max_length / 2 - 5)}QUERY PLAN
+{('-' * max_length)}
+{plan}
+({num_rows} {num_rows_unit})
+
+"""
+
+ def generate_postgres_unit_tests(self, teardown_queries, create_queries, queries):
+ try:
+ if not os.path.isdir("report"):
+ os.mkdir("report")
+
+ # generate sql file
+ with open(f"report/{self.config.output}_pgunit.sql", "w") as result_file:
+ self.generate_output_file(create_queries, queries, result_file, teardown_queries, with_output=False)
+
+ # generate out file
+ with open(f"report/{self.config.output}_pgunit.out", "w") as result_file:
+ self.generate_output_file(create_queries, queries, result_file, teardown_queries, with_output=True)
+ except Exception:
+            # TODO: handle this failure properly; no PG unit files will be generated
+ self.config.logger.exception("Failed to generate unit files")
+
+ def generate_output_file(self, create_queries, queries, result_file, teardown_queries, with_output=False):
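+        # write a psql-compatible script: create and connect to the test database, apply DDL and session
+        # properties, then emit each query (followed by its expected plan when with_output is set)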
+ result_file.write(f"CREATE DATABASE {self.config.connection.database} with colocation = true;\n")
+ result_file.write(f"\c {self.config.connection.database}\n")
+ result_file.write(f"SET statement_timeout = '{self.config.ddl_query_timeout}s';\n")
+
+ for model_query in create_queries:
+ if model_query.startswith("--"):
+ result_file.write(model_query)
+ else:
+ result_file.write(self.add_semicolon(' '.join(model_query.split())))
+ result_file.write("\n")
+
+ result_file.write("\n")
+ result_file.write("-- TBD: ADD STATISTICS IMPORT QUERIES\n")
+ result_file.write("\n")
+
+ for session_prop in self.config.session_props:
+ result_file.write(self.add_semicolon(session_prop))
+ result_file.write("\n")
+
+ for query in queries:
+ best_found = " , !BEST_PLAN_FOUND" if not query.compare_plans(query.get_best_optimization(self.config)) else ""
+
+ result_file.write(f"-- Query Hash: {query.query_hash}{best_found}\n")
+ _, _, clean_query = parse_clear_and_parametrized_sql(query.get_explain(EXPLAIN, options=[ExplainFlags.COSTS_OFF]))
+ result_file.write(cleandoc(self.add_semicolon(clean_query)))
+
+ result_file.write("\n")
+
+ if with_output:
+ result_file.write(self.wrap_query_plan(query.cost_off_explain.full_str))
+
+ for model_query in teardown_queries:
+ result_file.write(self.add_semicolon(model_query))
+ result_file.write("\n")
+
+ result_file.write("\n")
diff --git a/src/actions/report.py b/src/actions/report.py
new file mode 100644
index 00000000..c26a9481
--- /dev/null
+++ b/src/actions/report.py
@@ -0,0 +1,165 @@
+import os
+import shutil
+import subprocess
+import time
+import numpy as np
+from pathlib import Path
+
+from config import Config
+
+
+class ObjectsMixin:
+    def __init__(self):
+ self.content = ""
+
+ def add_double_newline(self):
+ self.content += "\n\n"
+
+ def start_table(self, columns: str = "1"):
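+        # open an AsciiDoc table: a [cols="..."] attribute line followed by the |=== delimiter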
+ self.content += f"[cols=\"{columns}\"]\n" \
+ "|===\n"
+
+ def start_table_row(self):
+ self.content += "a|"
+
+ def end_table_row(self):
+ self.content += "\n"
+
+ def end_table(self):
+ self.content += "|===\n"
+
+ def start_source(self, additional_tags=None, linenums=True):
+ tags = f",{','.join(additional_tags)}" if additional_tags else ""
+ tags += ",linenums" if linenums else ""
+
+ self.content += f"[source{tags}]\n----\n"
+
+ def end_source(self):
+ self.content += "\n----\n"
+
+ def start_collapsible(self, name, sep='===='):
+ self.content += f"""\n\n.{name}\n[%collapsible]\n{sep}\n"""
+
+ def end_collapsible(self, sep='===='):
+ self.content += f"""\n{sep}\n\n"""
+
+
+class AbstractReportAction(ObjectsMixin):
+ def __init__(self, create_folders: bool = True):
+ super().__init__()
+
+ self.config = Config()
+ self.logger = self.config.logger
+
+ self.content = f"= Optimizer {self.get_report_name()} Test Report \n" \
+ f":source-highlighter: coderay\n" \
+ f":coderay-linenums-mode: inline\n\n"
+
+ self.start_collapsible("Reporting configuration")
+ self.start_source()
+ self.content += str(self.config)
+ self.end_source()
+ self.end_collapsible()
+
+ self.reported_queries_counter = 0
+ self.queries = []
+ self.sub_reports = []
+
+ self.start_date = time.strftime("%Y%m%d-%H%M%S")
+
+ if create_folders:
+ self.report_folder = f"report/{self.start_date}"
+ self.report_folder_imgs = f"report/{self.start_date}/imgs"
+ self.report_folder_tags = f"report/{self.start_date}/tags"
+
+ if self.config.clear:
+ self.logger.info("Clearing report directory")
+ shutil.rmtree("report", ignore_errors=True)
+
+ if not os.path.isdir("report"):
+ os.mkdir("report")
+
+ if not os.path.isdir(self.report_folder):
+ os.mkdir(self.report_folder)
+ os.mkdir(self.report_folder_imgs)
+ os.mkdir(self.report_folder_tags)
+
+ def get_report_name(self):
+ return ""
+
+ def geo_mean(self, iterable):
+ return np.exp(np.log(iterable).mean())
+
+ def report_model(self, model_queries):
+ if model_queries:
+ self.start_collapsible("Model queries")
+ self.start_source(["sql"])
+ self.content += "\n".join(
+ [query if query.endswith(";") else f"{query};" for query in model_queries])
+ self.end_source()
+ self.end_collapsible()
+
+ def report_config(self, config, collapsible_name):
+ if config:
+ self.start_collapsible(f"Collect configuration {collapsible_name}")
+ self.start_source(["sql"])
+ self.content += config
+ self.end_source()
+ self.end_collapsible()
+
+ def create_sub_report(self, name):
+ subreport = SubReport(name)
+ self.sub_reports.append(subreport)
+ return subreport
+
+ def publish_report(self, report_name):
+ index_html = f"{self.report_folder}/index_{self.config.output}.adoc"
+
+ with open(index_html, "w") as file:
+ file.write(self.content)
+
+ for sub_report in self.sub_reports:
+ with open(f"{self.report_folder_tags}/{sub_report.name}.adoc", "w") as file:
+ file.write(sub_report.content)
+
+ self.logger.info(f"Generating report file from {index_html} and compiling html")
+ asciidoc_return_code = subprocess.run(
+ f'{self.config.asciidoctor_path} '
+ f'-a stylesheet={os.path.abspath("adoc/adoc.css")} '
+ f'{index_html}',
+ shell=True).returncode
+
+ if self.sub_reports:
+ self.logger.info(f"Compiling {len(self.sub_reports)} subreports to html")
+ for sub_report in self.sub_reports:
+ subprocess.call(
+ f'{self.config.asciidoctor_path} '
+ f'-a stylesheet={os.path.abspath("adoc/adoc.css")} '
+ f'-r {os.path.abspath("adoc/copy-to-clipboard-docinfo-processor.rb")} '
+ f"{self.report_folder_tags}/{sub_report.name}.adoc",
+ shell=True)
+
+ if asciidoc_return_code != 0:
+ self.logger.exception("Failed to generate HTML file! Check asciidoctor path")
+ else:
+ report_html_path = Path(f'{self.report_folder}/index_{self.config.output}.html')
+ self.logger.info(f"Done! Check report at {report_html_path.absolute()}")
+
+ def append_tag_page_link(self, subreport_name: str, hashtag: str | None, readable_name: str):
+ hashtag = f"#{hashtag}" if hashtag else ""
+ self.content += f"\nlink:tags/{subreport_name}.html{hashtag}[{readable_name}]\n"
+
+
+class SubReport(ObjectsMixin):
+ def __init__(self, name):
+ self.config = Config()
+ self.logger = self.config.logger
+
+ self.name = name
+ self.content = f"= {name} subreport \n" \
+ f":source-highlighter: coderay\n" \
+ f":coderay-linenums-mode: inline\n\n"
+
+ def append_index_page_hashtag_link(self, hashtag: str, readable_name: str):
+ hashtag = f"#{hashtag}" if hashtag else ""
+ self.content += f"\nlink:../index_{self.config.output}.html{hashtag}[{readable_name}]\n"
diff --git a/src/actions/reports/cost.py b/src/actions/reports/cost.py
new file mode 100644
index 00000000..90a20cc1
--- /dev/null
+++ b/src/actions/reports/cost.py
@@ -0,0 +1,695 @@
+import html
+import inspect
+import re
+from collections.abc import Iterable
+from operator import attrgetter
+from urllib.parse import quote as url_quote
+
+import numpy as np
+from matplotlib import pyplot as plt
+from matplotlib import rcParams
+from sql_formatter.core import format_sql
+
+from collect import CollectResult
+from objects import PlanNode, PlanPrinter, Query, ScanNode
+from actions.report import AbstractReportAction, SubReport
+from actions.reports.cost_metrics import CostMetrics
+from actions.reports.cost_chart_specs import (CostChartSpecs, ChartGroup, ChartSpec,
+ DataPoint, PlotType)
+
+
+IMAGE_FILE_SUFFIX = '.svg'
+
+
+BOXPLOT_DESCRIPTION = (
+ '=== Boxplot Chart\n'
+ '\n```\n'
+ ' Q1-1.5IQR Q1 median Q3 Q3+1.5IQR\n'
+ ' (or min) |-----:-----| (or max)\n'
+ ' o |--------| : |--------| o o\n'
+ ' |-----:-----|\n'
+ 'flier <-----------> fliers\n'
+ ' IQR\n'
+ ' median: orange solid line\n'
+ ' mean: green dashed line\n'
+ '```\n'
+ '\n* Each box represents the inter-quartile range: [Q1 .. Q3] (+/- 25% from the median).\n'
+    '\n* The whiskers extending from the box represent 1.5 x the inter-quartile range.\n'
+ '\n* The bubbles beyond the whiskers represent fliers (outliers).\n'
+ '\n* References:\n'
+ '\n ** https://en.wikipedia.org/wiki/Box_plot\n'
+ '\n ** http://vita.had.co.nz/papers/boxplots.pdf\n'
+ '\n'
+)
+
+X_TIME_COST_CHART_DESCRIPTION = (
+ '=== X-Time-Cost Relationship Charts\n'
+ '\n* Each chartset consists of three charts: (x, y) = (cost, exec time), (x-metric, exec time)'
+ ' and (x-metric, cost) where `x-metric` is the metric of interest that affects the execution'
+ ' time such as node input-output row count ratio, node output data size, etc.'
+ '\n\n* The costs are adjusted by the actual row count using the following formula'
+ ' unless noted otherwise.\n'
+ '\n (total_cost - startup_cost) * actual_rows / estimated_rows + startup_cost\n'
+ '\n'
+)
+
+
+class CostReport(AbstractReportAction):
+ def __init__(self, model):
+ super().__init__()
+ self.cm = CostMetrics(model)
+ self.cs = CostChartSpecs(self.cm)
+ self.interactive = False
+ self.report_location = f'report/{self.start_date}'
+ self.image_folder = 'imgs'
+
+ def get_image_location(self):
+ return f'{self.report_location}/{self.image_folder}/'
+
+ def make_image_block(self, file_name, attrs, is_sub_report=False):
+ pre = 'image::'
+ if is_sub_report:
+ pre += '../'
+ return f'{pre}{self.image_folder}/{file_name}[{attrs}]\n'
+
+ @classmethod
+ def generate_report(cls, loq: CollectResult, interactive):
+ model = m.group(1) if (m := re.search(r"'model': '((?:\w|-)*)'", loq.config)) else ''
+ report = CostReport(model)
+ cm = report.cm
+ cs = report.cs
+ report.interactive = interactive
+
+ chart_specs = list()
+ if interactive:
+ chart_specs = report.choose_chart_spec(cs.get_xtc_chart_specs()
+ + cs.get_exp_chart_specs())
+ else:
+ report.define_version(loq.db_version)
+ report.report_config(loq.config, "YB")
+ report.report_model(loq.model_queries)
+
+ report.logger.info('Processing queries...')
+ for query in sorted(loq.queries, key=attrgetter('tag', 'query')):
+ cm.add_query(query)
+
+ report.logger.info(f"Processed {len(loq.queries)} queries {cm.num_plans} plans")
+ if cm.num_no_opt_queries:
+ report.logger.warn(f"Queries without non-default plans: {cm.num_no_opt_queries}")
+ if cm.num_invalid_cost_plans:
+ report.logger.warn(f"Plans with invalid costs: {cm.num_invalid_cost_plans}"
+ f", fixed: {cm.num_invalid_cost_plans_fixed}")
+
+ # for now, print and run the queries then populate self.index_prefix_gap_map manually.
+ # TODO: move collect and record the ndv to the "collection" step via flag.
+ # cm.build_index_prefix_gap_queries()
+ # return
+
+ if interactive:
+ report.collect_nodes_and_create_plots(chart_specs)
+ else:
+ report.collect_nodes_and_create_plots(
+ cs.get_dist_chart_specs()
+ + cs.get_xtc_chart_specs()
+ + cs.get_exp_chart_specs()
+ + cs.get_more_exp_chart_specs()
+ )
+
+ report.build_report()
+ report.publish_report("cost")
+
+ def get_report_name(self):
+ return "cost validation"
+
+ def define_version(self, version):
+ self.content += f"[VERSION]\n====\n{version}\n====\n\n"
+
+ def build_report(self):
+ id = 0
+
+ self.content += "\n== Time & Cost Distribution Charts\n"
+ self.content += "\n<<_boxplot_chart, Boxplot distribution chart description>>\n"
+ self.report_chart_groups(id, self.cs.dist_chart_groups)
+
+ self.content += "\n== Time - Cost Relationship Charts\n"
+ self.content += "\n<<_x_time_cost_relationship_charts, time - cost chart description>>\n"
+ id += self.report_chart_groups(id, self.cs.xtc_chart_groups)
+
+ id += self.report_chart_groups(id, self.cs.exp_chart_groups)
+ id += self.report_chart_groups(id, self.cs.more_exp_chart_groups)
+
+ self.content += "== All Queries\n"
+ self.build_all_plan_subreports()
+
+ self.content += "== Chart Descriptions\n"
+ self.content += BOXPLOT_DESCRIPTION
+ self.content += X_TIME_COST_CHART_DESCRIPTION
+
+ def report_chart_groups(self, start_id: int, chart_groups: Iterable[ChartGroup]):
+ id = start_id
+ cols = 3
+ i = 0
+ for cg in chart_groups:
+ self.content += f"\n=== {cg.title}\n"
+ self.content += f"\n{cg.description}\n"
+ self.start_table(cols)
+ title_row = ''
+ image_row = ''
+ for spec in filter(lambda s: bool(s.file_name), cg.chart_specs):
+ sub_report_tag = spec.file_name.replace(IMAGE_FILE_SUFFIX, '')
+ title = html.escape(f'{id} {spec.title}')
+ sub_report = self.create_sub_report(sub_report_tag)
+ sub_report.content += f"\n[#{sub_report_tag}]\n"
+ sub_report.content += f"== {title}\n\n{spec.description}\n\n"
+ self.report_chart(sub_report, spec)
+ dpstr = f'{sum([len(dp) for dp in spec.series_data.values()])} data points'
+ olstr = (f' after excluding #{sum([len(dp) for dp in spec.outliers.values()])}'
+ ' extreme outliers#') if spec.outliers else ''
+ title_row += f'|{title} +\n({dpstr}{olstr})'
+
+ image_row += 'a|'
+ image_attrs = (f'link="tags/{sub_report_tag}.html",align="center"')
+ image_row += self.make_image_block(spec.file_name, image_attrs)
+ if i % cols == 2:
+ self.content += title_row
+ self.content += '\n\n'
+ self.content += image_row
+ title_row = ''
+ image_row = ''
+ i += 1
+ id += 1
+ while i % cols != 0:
+ title_row += '|'
+ image_row += 'a|\n'
+ i += 1
+ self.content += title_row
+ self.content += '\n'
+ self.content += image_row
+
+ self.end_table()
+ return id - start_id
+
+ @staticmethod
+ def report_chart_filters(report: SubReport, spec: ChartSpec):
+ report.start_collapsible("Chart specifications")
+ report.start_source(["python"])
+ report.content += "=== Query Filters ===\n"
+ for f in spec.query_filter, *spec.xtra_query_filter_list:
+ report.content += inspect.getsource(f)
+ report.content += "=== Node Filters ===\n"
+ for f in spec.node_filter, *spec.xtra_node_filter_list:
+ report.content += inspect.getsource(f)
+ report.content += "=== X Axsis Data ===\n"
+ report.content += inspect.getsource(spec.x_getter)
+ report.content += "=== Series Suffix ===\n"
+ report.content += inspect.getsource(spec.series_suffix)
+ report.content += "=== Options ===\n"
+ report.content += str(spec.options)
+ report.end_source()
+ report.end_collapsible()
+
+ @staticmethod
+ def report_queries(report: SubReport, queries):
+ report.start_collapsible(f"Queries ({len(queries)})")
+ report.start_source(["sql"])
+ report.content += "\n".join([query if query.endswith(";") else f"{query};"
+ for query in sorted(queries)])
+ report.end_source()
+ report.end_collapsible()
+
+ @staticmethod
+ def make_plan_node_link(cm: CostMetrics, node: PlanNode):
+ query = cm.get_node_query(node)
+ sqlfile = cm.get_node_parent_query(node).tag
+ anchor = query.query_hash + (CostReport.make_name(query.explain_hints)
+ if query.explain_hints else '')
+
+ node_str = node.get_full_str(estimate=True, actual=True, properties=False, level=False)
+        if isinstance(node, ScanNode):
+            pat_str = r' on (?P<schema>\S+\.)' + node.table_name + ' '
+            if m := re.search(pat_str, query.execution_plan.full_str):
+ node_str = node_str.replace(' on ' + node.table_name,
+ ' on ' + m.group('schema') + node.table_name)
+ highlight = url_quote(node_str, safe='/()')
+ return f"{sqlfile}.html#{anchor}:~:text={highlight}"
+
+ @staticmethod
+ def report_plot_data(report: SubReport, cm: CostMetrics, title,
+ plot_data, data_labels, outliers):
+ num_dp = sum([len(dp) for dp in plot_data.values()])
+ enh = '#' if outliers else ''
+ report.start_collapsible(f"{enh}{title} ({num_dp}){enh}", sep="=====")
+ report.content += "'''\n"
+ if plot_data:
+ table_header = '|'.join(data_labels)
+ table_header += '\n'
+ for series_label, data_points in sorted(plot_data.items()):
+ report.start_collapsible(f"`{series_label}` ({len(data_points)})")
+ report.start_table('<1m,2*^1m,8a,1a')
+ report.start_table_row()
+ report.content += table_header
+ report.end_table_row()
+ for x, cost, time_ms, node in sorted(data_points,
+ key=attrgetter('x', 'time_ms', 'cost'),
+ reverse=outliers):
+ report.content += f">|{x:.3f}\n>|{time_ms:.3f}\n>|{cost:.3f}\n|\n"
+ report.start_source(["sql"], linenums=False)
+ report.content += str(node)
+ report.end_source()
+ report.content += '\n|'
+ report.content += (f'link:{CostReport.make_plan_node_link(cm, node)}'
+ f'[{cm.get_node_query(node).query_hash}]')
+ report.content += '\n'
+
+ report.end_table()
+ report.end_collapsible()
+
+ report.content += "'''\n"
+ report.end_collapsible(sep="=====")
+
+ @staticmethod
+ def report_stats(report: SubReport, spec: ChartSpec):
+ report.start_table('3,8*^1m')
+ report.content += f'|{html.escape(spec.ylabel1)}'
+ report.content += '|p0 (min)'
+ report.content += '|p25 (Q1)'
+ report.content += '|p50{nbsp}(median)'
+ report.content += '|mean'
+ report.content += '|p75 (Q3)'
+ report.content += '|p100 (max)'
+ report.content += '|IQR (Q3-Q1)'
+ report.content += '|SD\n\n'
+
+ for series_label, data_points in sorted(spec.series_data.items()):
+ transposed_data = np.split(np.array(data_points).transpose(), len(DataPoint._fields))
+ xdata = transposed_data[0][0]
+ ptile = np.percentile(xdata, [0, 25, 50, 75, 100])
+ report.content += f'|{series_label}\n'
+ report.content += f'>|{ptile[0]:.3f}\n'
+ report.content += f'>|{ptile[1]:.3f}\n'
+ report.content += f'>|{ptile[2]:.3f}\n'
+ report.content += f'>|{np.mean(xdata):.3f}\n'
+ report.content += f'>|{ptile[3]:.3f}\n'
+ report.content += f'>|{ptile[4]:.3f}\n'
+ report.content += f'>|{ptile[3] - ptile[1]:.3f}\n'
+ report.content += f'>|{np.std(xdata):.3f}\n'
+
+ report.end_table()
+
+ def report_chart(self, report: SubReport, spec: ChartSpec):
+ report.start_table()
+ report.content += 'a|'
+ report.content += self.make_image_block(spec.file_name, f'{spec.title},align="center"',
+ True)
+ report.end_table()
+ if spec.is_boxplot():
+ CostReport.report_stats(report, spec)
+
+ CostReport.report_chart_filters(report, spec)
+ CostReport.report_queries(report, spec.queries)
+ cost_label = ('adjusted ' if spec.options.adjust_cost_by_actual_rows else '') + 'cost'
+ data_labels = [f'{html.escape(spec.xlabel)}', 'time_ms', cost_label, 'node', 'query hash']
+ if spec.outliers:
+ CostReport.report_plot_data(report, self.cm,
+ ''.join(['Extreme ',
+ spec.outlier_axis,
+ ' outliers excluded from the plots']),
+ spec.outliers, data_labels, outliers=True)
+ CostReport.report_plot_data(report, self.cm, 'Plot data',
+ spec.series_data, data_labels, outliers=False)
+
+ def build_all_plan_subreports(self):
+ tags = dict()
+ for qctx in self.cm.query_context_map.values():
+ if not qctx.query.tag:
+ self.logger.warn(f'Found "Query" without tag: {qctx.query.query}')
+ tags.setdefault(qctx.query.tag, list()).append(qctx.query)
+
+ for tag, queries in sorted(tags.items()):
+ queries.sort(key=attrgetter('query'))
+ sub_report = self.create_sub_report(tag)
+ self.report_all_plans(sub_report, queries)
+
+ self.start_collapsible(f"link:tags/{tag}.html[{tag}.sql]")
+ for query in queries:
+ self.content += f"\n[#{query.query_hash}_top]\n"
+ self.content += f"link:tags/{tag}.html#{query.query_hash}[{query.query_hash}]\n"
+ self.start_source(["sql"])
+ self.content += format_sql(query.get_reportable_query())
+ self.end_source()
+ self.content += '\n\n'
+ self.end_collapsible()
+
+ @staticmethod
+ def report_all_plans(report: SubReport, queries: Iterable[Query]):
+ for query in queries:
+ report.content += f"\n[#{query.query_hash}]\n"
+ report.content += f"== Query {query.query_hash}\n\n"
+ report.append_index_page_hashtag_link("top", "Go to index")
+ report.append_index_page_hashtag_link(f"{query.query_hash}_top", "Show in summary")
+ report.add_double_newline()
+
+ report.start_source(["sql"])
+ report.content += format_sql(query.get_reportable_query())
+ report.end_source()
+
+ report.content += '=== No hint\n'
+ report.start_source(["diff"])
+ report.content += query.execution_plan.full_str
+ report.end_source()
+
+ for opt in query.optimizations:
+ if not opt.execution_plan:
+ continue
+ anchor = query.query_hash + CostReport.make_name(opt.explain_hints)
+ report.content += f"\n[#{anchor}]\n"
+ report.content += f'=== Hints: [`{opt.explain_hints}`]\n'
+ report.start_source(["diff"])
+ report.content += opt.execution_plan.full_str
+ report.end_source()
+
+ @staticmethod
+ def get_series_color(series_label):
+ # choices of colors = [ 'b', 'g', 'r', 'c', 'm', 'k' ]
+ if 'Seq Scan' in series_label:
+ return 'b'
+ elif re.search(r'Index Scan.*\(PK\)', series_label):
+ return 'm'
+ elif 'Index Scan' in series_label:
+ return 'r'
+ elif 'Index Only Scan' in series_label:
+ return 'g'
+ return 'k'
+
+ def collect_nodes_and_create_plots(self, specs: Iterable[ChartSpec]):
+ self.logger.info('Collecting data points...')
+
+ for spec in specs:
+ for query_str, table_node_list_map in self.cm.query_table_node_map.items():
+ if not spec.test_query(query_str):
+ continue
+ for node_list in table_node_list_map.values():
+ for node in node_list:
+ if not spec.test_node(node):
+ continue
+
+ spec.queries.add(query_str)
+
+ multiplier = (int(node.nloops)
+ if spec.options.multipy_by_nloops else 1)
+
+ xdata = round(float(spec.x_getter(node)), 3)
+ cost = round(multiplier * float(node.get_actual_row_adjusted_cost()
+ if spec.options.adjust_cost_by_actual_rows
+ else node.total_cost), 3)
+ time_ms = round(float(node.total_ms) * multiplier, 3)
+
+ if node.is_seq_scan:
+ series_label = 'Seq Scan'
+ elif node.is_any_index_scan:
+ series_label = ''.join([
+ f"{node.node_type}",
+ (' (PK)' if node.index_name.endswith('_pkey') else ''),
+ (' Backward' if node.is_backward else ''),
+ ])
+ else:
+ series_label = node.name
+
+ if suffix := spec.series_suffix(node):
+ series_label += f' {suffix}'
+
+ spec.series_data.setdefault(series_label, list()).append(
+ DataPoint(xdata, cost, time_ms, node))
+
+ self.logger.info('Generating plots...')
+
+ marker_style = ['.', 'o', 'v', '^', '<',
+ '>', '8', 's', 'p', '*',
+ 'h', 'H', 'D', 'd', 'P', 'X']
+ line_style = ['-', '--', '-.', ':']
+
+ plotters = {
+ PlotType.BOXPLOT: CostReport.draw_boxplot,
+ PlotType.X_TIME_COST_PLOT: CostReport.draw_x_time_cost_plot,
+ }
+
+ for spec in specs:
+ for i, (series_label, data_points) in enumerate(sorted(spec.series_data.items())):
+ fmt = self.get_series_color(series_label)
+ fmt += marker_style[(i+3) % len(marker_style)]
+ fmt += line_style[(i+5) % len(line_style)]
+ spec.series_format[series_label] = fmt
+
+ plotters[spec.plotter](self, spec)
+
+ def choose_chart_spec(self, chart_specs):
+ choices = '\n'.join([f'{n}: {s.title}' for n, s in enumerate(chart_specs)])
+ while True:
+ try:
+ response = int(input(f'{choices}\n[0-{len(chart_specs)-1}] --> '))
+ if response < 0 or response >= len(chart_specs):
+ raise ValueError
+ break
+ except ValueError:
+ print(f"*** Enter a number in range [0..{len(chart_specs)-1}] ***")
+ response = -1
+ return [chart_specs[int(response)]]
+
+ __xtab = str.maketrans(" !\"#$%&'()*+,./:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^`{|}~",
+ "---------------------abcdefghijklmnopqrstuvwxyz---------")
+
+ @staticmethod
+ def make_name(text):
+ return text.translate(CostReport.__xtab).strip('-')
+
+ __fno = -1
+
+ @staticmethod
+ def make_file_name(text: str):
+ CostReport.__fno += 1
+ return (f"{CostReport.__fno:06d}-"
+ f"{re.sub(r'-+', '-', CostReport.make_name(text))}"
+ f"{IMAGE_FILE_SUFFIX}")
+
+ def draw_x_time_cost_plot(self, spec):
+ title = spec.title
+ xy_labels = [spec.xlabel,
+ spec.ylabel1 + (' (adjusted)'
+ if spec.options.adjust_cost_by_actual_rows else ''),
+ spec.ylabel2]
+
+ rcParams['font.family'] = 'serif'
+ rcParams['font.size'] = 10
+
+ fig, axs = plt.subplots(1, 3, figsize=(27, 8), layout='constrained')
+ fig.suptitle(title, fontsize='xx-large')
+
+ chart_ix = [(1, 2), (0, 2), (0, 1)] # cost-time, x-time, x-cost
+ log_scale_axis = [spec.options.log_scale_x,
+ spec.options.log_scale_cost,
+ spec.options.log_scale_time]
+ for i in range(len(chart_ix)):
+ ax = axs[i]
+ x, y = chart_ix[i]
+ xlabel = xy_labels[x] + (' (log)' if log_scale_axis[x] else '')
+ ylabel = xy_labels[y] + (' (log)' if log_scale_axis[y] else '')
+
+ ax.set_box_aspect(1)
+ ax.set_title(f'{xlabel} - {ylabel}', fontsize='x-large')
+ ax.set_xlabel(xlabel)
+ ax.set_ylabel(ylabel)
+ if not spec.series_data:
+ ax.text(0.5, 0.5, "NO DATA", size=50, family='sans serif', rotation=30.,
+ ha="center", va="center", alpha=0.4)
+
+ for series_label, data_points in sorted(spec.series_data.items()):
+ data_points.sort(key=attrgetter('x', 'time_ms', 'cost'))
+ transposed_data = np.split(np.array(data_points).transpose(), len(DataPoint._fields))
+ cost_per_time = transposed_data[1][0] / transposed_data[2][0]
+ if (iqr := np.subtract(*np.percentile(cost_per_time, [75, 25]))) > 0:
+ indices = np.nonzero(cost_per_time >
+ (np.percentile(cost_per_time, [75]) + 4 * iqr))[0]
+ if len(indices):
+ outliers = list()
+ for ix in reversed(indices):
+ outliers.append(data_points[ix])
+ del data_points[ix]
+
+ outliers.sort(key=attrgetter('cost', 'x', 'time_ms'), reverse=True)
+ spec.outliers[series_label] = outliers
+ transposed_data = np.split(np.array(data_points).transpose(),
+ len(DataPoint._fields))
+
+ for i in range(len(chart_ix)):
+ x, y = chart_ix[i]
+ ax = axs[i]
+ ax.plot(transposed_data[x][0],
+ transposed_data[y][0],
+ spec.series_format[series_label],
+ label=series_label,
+ alpha=0.35,
+ picker=self.line_picker)
+
+ if log_scale_axis[x]:
+ ax.set_xscale('log')
+ ax.set_xbound(lower=1.0)
+ else:
+ ax.set_xbound(lower=0.0)
+
+ if log_scale_axis[y]:
+ ax.set_yscale('log')
+ ax.set_ybound(lower=1.0)
+ else:
+ ax.set_ybound(lower=0.0)
+
+ if self.interactive:
+            for query_str in sorted(spec.queries):
+                self.logger.debug(query_str)
+ self.show_charts_and_handle_events(spec, fig, axs)
+ else:
+ if spec.series_data:
+ # show the legend on the last subplot
+ axs[-1].legend(fontsize='xx-small',
+ ncols=int((len(spec.series_data.keys())+39)/40.0))
+
+ spec.file_name = self.make_file_name('-'.join([title, xlabel]))
+ plt.savefig(self.get_image_location() + spec.file_name,
+ dpi=50 if spec.series_data else 300)
+
+ plt.close()
+
+ def draw_boxplot(self, spec):
+ title = spec.title
+ xlabel = spec.xlabel
+ ylabel = spec.ylabel1
+
+ rcParams['font.family'] = 'serif'
+ rcParams['font.size'] = 10
+
+ fig, ax = plt.subplots(1, figsize=(12, 2.7), layout='constrained')
+
+ ax.set_title(title, fontsize='large')
+ ax.set_xlabel(xlabel)
+ ax.set_ylabel(ylabel)
+ if not spec.series_data:
+            ax.text(0.5, 0.5, "NO DATA", size=50, family='sans-serif', rotation=30.,
+ ha="center", va="center", alpha=0.4)
+
+ data = list()
+ labels = list()
+ for series_label, data_points in sorted(spec.series_data.items()):
+ transposed_data = np.split(np.array(data_points).transpose(), len(DataPoint._fields))
+ xdata = transposed_data[0][0]
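+            # Same IQR-based outlier filtering as the scatter charts, here on the
+            # x-axis values with a Q3 + 3*IQR cutoff.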
+ if (iqr := np.subtract(*np.percentile(xdata, [75, 25]))) > 0:
+ indices = np.nonzero(xdata > (np.percentile(xdata, [75]) + 3 * iqr))[0]
+ if len(indices):
+ outliers = list()
+ for ix in reversed(indices):
+ outliers.append(data_points[ix])
+ del data_points[ix]
+
+ spec.outlier_axis = "x-axis value"
+ outliers.sort(key=attrgetter('x', 'time_ms', 'cost'), reverse=True)
+ spec.outliers[series_label] = outliers
+ xdata = np.delete(xdata, indices, axis=0)
+
+ data.append(xdata)
+ labels.append(series_label)
+
+ ax.boxplot(data, labels=labels, vert=False, meanline=True, showmeans=True,
+ sym=None if spec.options.bp_show_fliers else '')
+
+ ax.xaxis.grid(True)
+
+ if spec.options.log_scale_x:
+ ax.set_xscale('log')
+
+ if self.interactive:
+ self.show_charts_and_handle_events(spec, fig, [ax])
+ else:
+ spec.file_name = self.make_file_name('-'.join([title, xlabel]))
+ plt.savefig(self.get_image_location() + spec.file_name,
+ dpi=50 if spec.series_data else 300)
+
+ plt.close()
+
+ @staticmethod
+ def line_picker(line, event):
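+        # Custom picker for Line2D artists: reports a hit when the mouse event is
+        # within `maxd` display pixels of a data point, returning the picked point
+        # indices and the event position in axes coordinates.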
+ if event.xdata is None:
+ return False, dict()
+ ax = event.inaxes
+ # convert to display pixel coordinate
+ [x], [y] = np.split(ax.transData.transform(line.get_xydata()).T, 2)
+ (event_x, event_y) = (event.x, event.y)
+ maxd = 10 # pixel radius from the pick event point
+
+ d = np.sqrt((x - event_x)**2 + (y - event_y)**2)
+ # print(f'line={line}\n' \
+ # f'x={x}\ny={y}\n' \
+ # f'event_x={event_x} event_y={event_y}\n' \
+ # f'd={d}\n' \
+ # f'ind where (d <= maxd)={np.nonzero(d <= maxd)}')
+ ind, = np.nonzero(d <= maxd)
+ if len(ind):
+ pickx = line.get_xdata()[ind]
+ picky = line.get_ydata()[ind]
+ [axxy] = ax.transAxes.inverted().transform([(event.x, event.y)])
+ props = dict(line=line, ind=ind, pickx=pickx, picky=picky,
+ axx=axxy[0], axy=axxy[1])
+ return True, props
+ else:
+ return False, dict()
+
+ def show_charts_and_handle_events(self, spec, fig, axs):
+ def on_pick(event):
+ ann = anns[id(event.mouseevent.inaxes)]
+ series = event.line.get_label()
+ data_point = spec.series_data[series][event.ind[0]]
+ node: PlanNode = data_point.node
+
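+            # alt-click: show the node's plan tree; shift-click: show the query text;
+            # plain click: show the node summary with estimates, actuals and prefix gaps.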
+ modifiers = event.mouseevent.modifiers
+ if 'alt' in modifiers:
+ ptree = self.cm.get_node_plan_tree(node)
+ ann.set_text(PlanPrinter.build_plan_tree_str(ptree))
+ elif 'shift' in modifiers:
+ query = self.cm.get_node_query(node)
+ ann.set_text(f'{query.query_hash}\n{query.query}')
+ else:
+ ann.set_text('\n'.join([
+ series,
+ *self.cm.wrap_expr(str(node), 72),
+ node.get_estimate_str(), node.get_actual_str(),
+ f'prefix gaps={self.cm.get_index_key_prefix_gaps(node)}',
+ ]))
+
+ ann.xy = event.artist.get_xydata()[event.ind][0]
+ ann.xyann = ((event.axx - 0.5)*(-200) - 120, (event.axy - 0.5)*(-200) + 40)
+
+ ann.set_visible(True)
+ fig.canvas.draw_idle()
+
+ def on_button_release(event):
+ if 'cmd' not in event.modifiers:
+ hide_all_annotations()
+
+ def hide_all_annotations():
+ redraw = False
+ for ann in anns.values():
+ redraw |= ann.get_visible()
+ ann.set_visible(False)
+ if redraw:
+ fig.canvas.draw_idle()
+
+ anns = dict()
+ for ax in axs:
+ anns[id(ax)] = ax.annotate("", xy=(0, 0),
+ textcoords="offset points", xytext=(0, 0),
+ bbox=dict(boxstyle="round", fc="w"),
+ arrowprops=dict(arrowstyle="->"))
+ ann = anns[id(ax)]
+ ann.set_wrap(True)
+ ann.set_zorder(8)
+
+ hide_all_annotations()
+ fig.canvas.mpl_connect('pick_event', on_pick)
+ fig.canvas.mpl_connect('button_release_event', on_button_release)
+ plt.show()
diff --git a/src/actions/reports/cost_chart_specs.py b/src/actions/reports/cost_chart_specs.py
new file mode 100644
index 00000000..fef4e249
--- /dev/null
+++ b/src/actions/reports/cost_chart_specs.py
@@ -0,0 +1,624 @@
+import re
+
+from collections import namedtuple
+from collections.abc import Callable, Iterable, Mapping
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+
+from objects import PlanNode, ScanNode
+from actions.reports.cost_metrics import CostMetrics
+
+
+DataPoint = namedtuple('DataPoint', ['x', 'cost', 'time_ms', 'node'])
+
+
+class PlotType(Enum):
+ BOXPLOT = 0
+ X_TIME_COST_PLOT = 1
+
+
+@dataclass(frozen=True)
+class ChartOptions:
+ adjust_cost_by_actual_rows: bool = False
+ multipy_by_nloops: bool = False
+ log_scale_x: bool = False
+ log_scale_cost: bool = False
+ log_scale_time: bool = False
+
+ bp_show_fliers: bool = True # boxplot only
+
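+    # Render only the enabled options, e.g. "adjust_cost_by_actual_rows,bp_show_fliers".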
+ def __str__(self):
+ return ','.join(filter(lambda a: getattr(self, a), self.__dict__.keys()))
+
+
+@dataclass
+class ChartSpec:
+ plotter: PlotType
+ title: str
+ description: str
+ xlabel: str
+ ylabel1: str
+ ylabel2: str
+ query_filter: Callable[[str], bool]
+ node_filter: Callable[[PlanNode], bool]
+ x_getter: Callable
+ series_suffix: Callable = lambda node: ''
+ options: ChartOptions = field(default_factory=ChartOptions)
+
+ xtra_query_filter_list: Iterable[Callable[[str], bool]] = field(default_factory=list)
+ xtra_node_filter_list: Iterable[Callable[[PlanNode], bool]] = field(default_factory=list)
+
+ file_name: str = ''
+ queries: set[str] = field(default_factory=set)
+    series_data: Mapping[str, Iterable[DataPoint]] = field(default_factory=dict)
+    series_format: Mapping[str, str] = field(default_factory=dict)
+    outliers: Mapping[str, Iterable[DataPoint]] = field(default_factory=dict)
+ outlier_axis: str = 'cost/time ratio'
+
+ def is_boxplot(self):
+ return self.plotter is PlotType.BOXPLOT
+
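+    # Create a modified copy of this spec: append to or replace the title, add extra
+    # query/node filters, and optionally override labels, x-getter, series suffix and options.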
+ def make_variant(self, xtra_title, overwrite_title=False,
+ description: str = None,
+ xtra_query_filter: Callable[[str], bool] = None,
+ xtra_node_filter: Callable[[PlanNode], bool] = None,
+ xlabel: str = None,
+ x_getter: Callable = None,
+                     series_suffix: Callable = None,
+ options: ChartOptions = None):
+ var = deepcopy(self)
+ if overwrite_title:
+ var.title = xtra_title
+ else:
+ var.title += f' ({xtra_title})'
+ if description:
+ var.description = description
+ if xtra_query_filter:
+ var.xtra_query_filter_list.append(xtra_query_filter)
+ if xtra_node_filter:
+ var.xtra_node_filter_list.append(xtra_node_filter)
+ if xlabel:
+ var.xlabel = xlabel
+ if x_getter:
+ var.x_getter = x_getter
+ if series_suffix:
+ var.series_suffix = series_suffix
+ if options:
+ var.options = options
+ return var
+
+ def test_query(self, query_str):
+ return all(f(query_str) for f in [self.query_filter, *self.xtra_query_filter_list])
+
+ def test_node(self, node):
+ return all(f(node) for f in [self.node_filter, *self.xtra_node_filter_list])
+
+
+@dataclass
+class ChartGroup:
+ title: str
+ description: str
+ chart_specs: Iterable[ChartSpec]
+
+
+class CostChartSpecs:
+ def __init__(self, cm: CostMetrics):
+ self.dist_specs = self.__make_dist_specs(cm)
+ self.primitive_metric_specs = self.__make_primitive_metric_specs(cm)
+ self.simple_index_scan_specs = self.__make_simple_index_scan_specs(cm)
+ self.literal_in_list_specs = self.__make_literal_in_list_specs(cm)
+ self.bnl_in_list_specs = self.__make_bnl_in_list_specs(cm)
+ self.composite_key_access_specs = self.__make_composite_key_access_specs(cm)
+ self.more_exp_specs = self.__make_more_exp_specs(cm)
+
+ self.dist_chart_groups = [
+ ChartGroup(
+ "Time & cost distribution of scan nodes without any local filtering",
+ '',
+ self.dist_specs,
+ ),
+ ]
+
+ if cm.model == 'cost-validation-primitive-metrics':
+ self.xtc_chart_groups = [
+ ChartGroup(
+ "Column/Value Position and Column Count",
+ ("t1000000cN: 1,000,000 row table with N unique integer columns\n\n"
+                 "t100000wX: 100,000 row table with an X-character varchar column\n\n"),
+ self.primitive_metric_specs,
+ ),
+ ]
+ self.exp_chart_groups = list()
+ self.more_exp_chart_groups = list()
+ else:
+ self.xtc_chart_groups = [
+ ChartGroup(
+ "Simple Index Access Conditions",
+ 'Index scans with simple index access conditions and corresponding seq scans',
+ self.simple_index_scan_specs,
+ ),
+ ChartGroup(
+ "Index scan nodes with literal IN-list",
+ '',
+ self.literal_in_list_specs,
+ ),
+ ChartGroup(
+ "Index scan nodes with parameterized IN-list created by BNL",
+ '',
+ self.bnl_in_list_specs,
+ ),
+ ]
+
+ self.exp_chart_groups = [
+ ChartGroup(
+ "Experimental Charts",
+ '',
+ self.composite_key_access_specs,
+ ),
+ ]
+
+ self.more_exp_chart_groups = [
+ ChartGroup(
+ "More Experimental Charts",
+ '',
+ self.more_exp_specs,
+ ),
+ ]
+
+ def get_dist_chart_specs(self):
+ return self.dist_specs
+
+ def get_xtc_chart_specs(self):
+ return (self.primitive_metric_specs
+ + self.simple_index_scan_specs
+ + self.literal_in_list_specs
+ + self.bnl_in_list_specs)
+
+ def get_exp_chart_specs(self):
+ return self.composite_key_access_specs
+
+ def get_more_exp_chart_specs(self):
+ return self.more_exp_specs
+
+ @staticmethod
+ def __make_dist_specs(cm: CostMetrics) -> Iterable[ChartSpec]:
+ return [
+ (boxplot_simple_scan_node := ChartSpec(
+ PlotType.BOXPLOT,
+ '',
+ ('* Nodes with local filter, recheck-removed-rows or partial aggregate'
+ ' are _excluded_\n'),
+ 'xlabel', 'Node type', '',
+ lambda query: True,
+ lambda node: (
+ float(node.nloops) == 1
+ and float(node.rows) >= 1
+ and cm.get_node_width(node) == 0
+ and cm.has_no_local_filtering(node)
+ and not cm.has_partial_aggregate(node)
+ ),
+ x_getter=lambda node: 1,
+ options=ChartOptions(adjust_cost_by_actual_rows=True),
+ )).make_variant(
+ 'Per row cost/time ratio of the scan nodes (width=0, rows>=1)', True,
+ (' ((total_cost - startup_cost) / estimated_rows)'
+ ' / ((total_time - startup_time) / actual_rows)\n'
+ '\n* Nodes with local filter, recheck-removed-rows or partial aggregate'
+ ' are _excluded_\n'),
+ xlabel='Per row cost/time ratio [1/ms]',
+ x_getter=lambda node: (cm.get_per_row_cost(node)
+ / (cm.get_per_row_time(node) or 0.01)),
+ ),
+ boxplot_simple_scan_node.make_variant(
+ 'Per row time of the scan nodes (width=0, rows>=1)', True,
+ xlabel='Per row execution time[ms]',
+ x_getter=lambda node: cm.get_per_row_time(node),
+ ),
+ boxplot_simple_scan_node.make_variant(
+ 'Per row cost of the scan nodes (width=0, rows>=1)', True,
+ xlabel='Per row cost',
+ x_getter=lambda node: cm.get_per_row_cost(node),
+ ),
+ boxplot_simple_scan_node.make_variant(
+ 'Startup cost/time ratio of the scan nodes (width=0)', True,
+ xlabel='Startup cost/time ratio [1/ms]',
+ x_getter=lambda node: (float(node.startup_cost)
+ / (float(node.startup_ms)
+ if float(node.startup_ms) else 0.001)),
+ ),
+ boxplot_simple_scan_node.make_variant(
+ 'Startup time of the scan nodes (width=0)', True,
+ xlabel='Startup time [ms]',
+ x_getter=lambda node: float(node.startup_ms),
+ ),
+ ]
+
+ @staticmethod
+ def __make_primitive_metric_specs(cm: CostMetrics) -> Iterable[ChartSpec]:
+ single_table_query_basic_scans = ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'Basic scan node from single table queries', '',
+ 'X', 'Estimated cost', 'Execution time [ms]',
+ lambda query: (
+ cm.is_single_table_query(query)
+ and not cm.has_aggregate(query)
+ ),
+ lambda node: (
+ isinstance(node, ScanNode)
+ and not cm.has_partial_aggregate(node)
+ ),
+ x_getter=lambda node: 0,
+ )
+
+ value_type_map = {
+ "t1000000d10": "decimal",
+ "t1000000d20": "decimal",
+ "t1000000d30": "decimal",
+ "t1000000d40": "decimal",
+ "t1000000d50": "decimal",
+ "t1000000d60": "decimal",
+ "t1000000d70": "decimal",
+ "t1000000d80": "decimal",
+ "t1000000d90": "decimal",
+ "t1000000d100": "decimal",
+ "t1000000i": "int",
+ "t1000000bi": "int",
+ "t1000000flt": "float",
+ "t1000000dbl": "float",
+ }
+
+ return [
+ single_table_query_basic_scans.make_variant(
+ ('Varying column width, select 0 or select v from single column table'
+                 ' (character type in 100000 row tables)'),
+ xtra_node_filter=lambda node: (
+ re.match(r't100000w\d+k*', node.table_name)
+ and cm.has_no_condition(node)
+ ),
+ xlabel='Column width',
+ x_getter=lambda node: cm.get_table_row_width(node.table_name),
+ series_suffix=lambda node: cm.get_node_query(node).query[0:len('select X')],
+ ),
+ single_table_query_basic_scans.make_variant(
+ ('Varying column width, select v from single column table'
+ ' (numeric type variants in 1000000 row tables)'),
+ xtra_query_filter=lambda query: (
+ query.startswith('select v from')
+ ),
+ xtra_node_filter=lambda node: (
+ re.match(r't1000000(([d]\d+)|i|bi|flt|dbl)', node.table_name)
+ and cm.has_no_condition(node)
+ ),
+ xlabel='Column width',
+ x_getter=lambda node: cm.get_table_row_width(node.table_name),
+ series_suffix=lambda node: value_type_map[node.table_name],
+ ),
+ single_table_query_basic_scans.make_variant(
+ 'Varying column count in table, select 0, no condition',
+ xtra_query_filter=lambda query: query.startswith('select 0 from'),
+ xtra_node_filter=lambda node: (
+ re.match(r't1000000c\d+', node.table_name)
+ and cm.has_no_condition(node)
+ ),
+ xlabel='Column count',
+ x_getter=lambda node: cm.get_table_column_count(node.table_name),
+ ),
+ single_table_query_basic_scans.make_variant(
+ 'Varying column count in remote filter, select 0',
+ xtra_query_filter=lambda query: (
+ re.match(r'select 0 from t1000000c10 where [c0-9+ ]+ = +500000', query)
+ ),
+ xlabel='Column count',
+ x_getter=lambda node: (
+ cm.has_only_scan_filter_condition(node)
+ and len(cm.get_columns_in_query(cm.get_node_query_str(node)))
+ ),
+ series_suffix=lambda node: (
+ ''.join([
+ '(remote filter)' if cm.has_only_scan_filter_condition(node) else ''
+ ])),
+ ),
+ single_table_query_basic_scans.make_variant(
+ 'Varying column count in select-list',
+ xtra_node_filter=lambda node: (
+ node.table_name == 't1000000c10'
+ and cm.has_no_condition(node)
+ and (len(cm.get_columns_in_query(cm.get_node_query_str(node))) > 1
+ or (cm.get_columns_in_query(cm.get_node_query_str(node))
+ in (['c4'], ['c5']))) # a column in the middle only
+ ),
+ xlabel='Column count',
+ x_getter=lambda node: len(cm.get_columns_in_query(cm.get_node_query_str(node))),
+ ),
+
+ single_table_query_basic_scans.make_variant(
+                'Varying position of column in select-list only, no condition', True,
+ xtra_node_filter=lambda node: (
+ node.table_name == 't1000000c10'
+ and cm.get_node_width(node) == 4
+ and cm.has_no_condition(node)
+ ),
+ xlabel='Column position',
+ x_getter=lambda node: cm.get_single_column_query_column_position(node),
+ ),
+ single_table_query_basic_scans.make_variant(
+ 'Varying position of column in select-list and condition', True,
+ xtra_query_filter=lambda query: (
+ re.match(r'select c\d+ from t1000000c10 where c\d+ = +1000000 / 2', query)
+ ),
+ xlabel='Column position',
+ x_getter=lambda node: cm.get_single_column_query_column_position(node),
+ ),
+ single_table_query_basic_scans.make_variant(
+                'Varying value position in index lookup via normalized value', True,
+ xtra_query_filter=lambda query: (
+ True
+ ),
+ xtra_node_filter=lambda node: (
+ node.table_name == 't1000000c10'
+ and cm.get_node_width(node) == 4
+ and not cm.has_no_condition(node)
+ and cm.has_only_simple_condition(node,
+ index_cond_only=True,
+ index_key_prefix_only=True)
+ and cm.get_single_column_node_normalized_eq_cond_value(node) is not None
+ ),
+ xlabel='Normalized value',
+ x_getter=lambda node: cm.get_single_column_node_normalized_eq_cond_value(node),
+ series_suffix=lambda node: node.index_name or '',
+ ),
+ ]
+
+ @staticmethod
+ def __make_simple_index_scan_specs(cm: CostMetrics) -> Iterable[ChartSpec]:
+ return [
+ (chart_simple_index_scan := ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ ('Index scans with simple index access conditions and corresponding seq scans'),
+ '',
+ 'Output row count', 'Estimated cost', 'Execution time [ms]',
+ lambda query: (
+ cm.is_single_table_query(query)
+ and cm.has_no_filter_indexscan(query)
+ and not cm.has_local_filter(query)
+ and not cm.has_aggregate(query)
+ ),
+ lambda node: (
+ cm.has_no_local_filtering(node)
+ and not cm.has_no_condition(node)
+ and (node.is_seq_scan
+ or (node.is_any_index_scan
+ and cm.has_only_simple_condition(node, index_cond_only=True)))
+ ),
+ x_getter=lambda node: float(node.rows),
+ series_suffix=(lambda node:
+ f'{node.index_name or node.table_name}:'
+ f'width={cm.get_node_width(node)}'),
+ options=ChartOptions(adjust_cost_by_actual_rows=True),
+ )).make_variant(
+ 'Table t100000 and t100000w, series by node type', True,
+ xtra_query_filter=lambda query: 't100000 ' in query or 't100000w ' in query,
+ xtra_node_filter=(lambda node:
+ float(cm.get_table_row_count(node.table_name)) == 100000),
+ series_suffix=(lambda node:
+ f'{node.table_name}:width={cm.get_node_width(node)}'),
+ ),
+ chart_simple_index_scan.make_variant(
+ 'Table t100000, series by index', True,
+ xtra_query_filter=lambda query: 't100000 ' in query,
+ xtra_node_filter=(lambda node:
+ float(cm.get_table_row_count(node.table_name)) == 100000),
+ ),
+ chart_simple_index_scan.make_variant(
+ 'Table t100000w, series by index', True,
+ xtra_query_filter=lambda query: 't100000w ' in query,
+ xtra_node_filter=(lambda node:
+ float(cm.get_table_row_count(node.table_name)) == 100000),
+ ),
+ chart_simple_index_scan.make_variant(
+ 'Table t10000, series by index', True,
+ xtra_query_filter=lambda query: 't10000 ' in query,
+ xtra_node_filter=(lambda node:
+ float(cm.get_table_row_count(node.table_name)) == 10000),
+ ),
+ chart_simple_index_scan.make_variant(
+ 'Table t1000, series by index', True,
+ xtra_query_filter=lambda query: 't1000 ' in query,
+ xtra_node_filter=(lambda node:
+ float(cm.get_table_row_count(node.table_name)) == 1000),
+ ),
+ chart_simple_index_scan.make_variant(
+ 'Table t100, series by index', True,
+ xtra_query_filter=lambda query: 't100 ' in query,
+ xtra_node_filter=(lambda node:
+ float(cm.get_table_row_count(node.table_name)) == 100),
+ ),
+ ]
+
+ @staticmethod
+ def __make_literal_in_list_specs(cm: CostMetrics) -> Iterable[ChartSpec]:
+ return [
+ chart_single_literal_in_list := ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'Index scan nodes with single literal IN-list',
+ '',
+ 'Output row count', 'Estimated cost', 'Execution time [ms]',
+ lambda query: True,
+ lambda node: (
+ cm.has_literal_inlist_index_cond(node, single_in_list_only=True)
+ and cm.has_no_local_filtering(node)
+ ),
+ x_getter=lambda node: float(node.rows),
+ series_suffix=(lambda node:
+ ''.join([
+ f'{node.index_name or node.table_name}:',
+ f'width={cm.get_node_width(node)}',
+ ' ncInItems=',
+ cm.build_non_contiguous_literal_inlist_count_str(
+ node.table_name, node.get_index_cond()),
+ ])),
+ ),
+ chart_literal_in_list := ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ ('Index scan nodes with literal IN-list'
+                 ' - 1 or 2 IN-lists, or an IN-list and a simple index access condition'),
+ '',
+ 'Output row count', 'Estimated cost', 'Execution time [ms]',
+ lambda query: True,
+ lambda node: (
+ cm.has_literal_inlist_index_cond(node)
+ and cm.has_no_local_filtering(node)
+ ),
+ x_getter=lambda node: float(node.rows),
+ series_suffix=(lambda node:
+ ''.join([
+ f'{node.index_name or node.table_name}:',
+ f'width={cm.get_node_width(node)}',
+ ' ncInItems=',
+ cm.build_non_contiguous_literal_inlist_count_str(
+ node.table_name, node.get_index_cond()),
+ ])),
+ ),
+ chart_single_literal_in_list.make_variant(
+ "output <= 100 rows",
+ xtra_node_filter=lambda node: float(node.rows) <= 100,
+ ),
+ chart_literal_in_list.make_variant(
+ "output <= 100 rows",
+ xtra_node_filter=lambda node: float(node.rows) <= 100,
+ ),
+ ]
+
+ @staticmethod
+ def __make_bnl_in_list_specs(cm: CostMetrics) -> Iterable[ChartSpec]:
+ return [
+ ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'Parameterized IN-list index scans (BNL)',
+ '',
+ 'Output row count', 'Estimated cost', 'Execution time [ms]',
+ lambda query: True,
+ lambda node: (
+ cm.has_bnl_inlist_index_cond(node)
+ and cm.has_no_local_filtering(node)
+ ),
+ x_getter=lambda node: float(node.rows),
+ series_suffix=(lambda node:
+ f'{node.index_name}:width={cm.get_node_width(node)}'
+ f' loops={node.nloops}'
+ ),
+ options=ChartOptions(adjust_cost_by_actual_rows=True),
+ ),
+ ]
+
+ @staticmethod
+ def __make_composite_key_access_specs(cm: CostMetrics) -> Iterable[ChartSpec]:
+ return [
+ chart_composite_key := ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'Composite key index scans',
+ '',
+ 'Output row count', 'Estimated cost', 'Execution time [ms]',
+ lambda query: 't1000000m' in query or 't1000000c10' in query,
+ lambda node: (
+ cm.has_no_local_filtering(node)
+ and cm.has_only_simple_condition(node, index_cond_only=True)
+ ),
+ x_getter=lambda node: float(node.rows),
+ series_suffix=lambda node: f'{node.index_name}',
+ ),
+ chart_composite_key.make_variant(
+ 'output rows <= 100',
+ xtra_node_filter=lambda node: float(node.rows) <= 100,
+ ),
+ chart_composite_key.make_variant(
+ 'output rows <= 100, x=output_row_count x key_prefix_gaps',
+ description=(
+ "Index key prefix gaps: NDV of the keys before the first equality condition."
+ "\n\ne.g.: for index key `(c3, c4, c5)`,"
+                    " with condition `c4 >= x and c5 = y`, the prefix NDV would be:"
+ " `select count(*) from (select distinct c3, c4 from t where c4 >= x) v;`"),
+ xtra_node_filter=lambda node: float(node.rows) <= 100,
+ xlabel='Output row count x key prefix gaps',
+ x_getter=lambda node: float(node.rows) * cm.get_index_key_prefix_gaps(node),
+ ),
+ chart_composite_key.make_variant(
+ 'key_prefix_gaps in series criteria',
+ description=(
+ "Index key prefix gaps: NDV of the keys before the first equality condition."
+ "\n\ne.g.: for index key `(c3, c4, c5)`,"
+                    " with condition `c4 >= x and c5 = y`, the prefix NDV would be:"
+ " `select count(*) from (select distinct c3, c4 from t where c4 >= x) v;`"),
+                series_suffix=lambda node: (f'{node.index_name}'
+                                            f' gaps={cm.get_index_key_prefix_gaps(node)}'),
+ ),
+ ]
+
+ @staticmethod
+ def __make_more_exp_specs(cm: CostMetrics) -> Iterable[ChartSpec]:
+ return [
+ ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'Scans with simple remote index and/or table filter',
+ "* Index (Only) Scans may or may not have index access condition as well.",
+ 'Output row count', 'Estimated cost', 'Execution time [ms]',
+ lambda query: cm.has_scan_filter_indexscan(query),
+ lambda node: (
+ cm.has_only_scan_filter_condition(node)
+ or cm.has_only_simple_condition(node, index_cond_only=True)
+ ),
+ x_getter=lambda node: float(node.rows),
+ series_suffix=lambda node: (
+ f'{node.index_name or node.table_name}'
+ f':width={cm.get_node_width(node)} loops={node.nloops}'
+ ),
+ ),
+ ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'Scans with remote filter(s)',
+ '',
+ 'Output row count', 'Estimated cost', 'Execution time [ms]',
+ lambda query: True,
+ lambda node: cm.has_only_simple_condition(node, index_cond_only=False),
+ x_getter=lambda node: float(node.rows),
+ series_suffix=(lambda node:
+ f'{node.index_name or node.table_name}'
+ f':width={cm.get_node_width(node)} loops={node.nloops}'),
+ ),
+ chart_agg_pushdown := ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'Full scan + agg push down by table rows',
+ ('Scan nodes from `select count(*) from ...` single table queries without'
+ ' any search conditions'
+ '\n\n* The costs are not adjusted'),
+ 'Table rows', 'Estimated cost', 'Execution time [ms]',
+ lambda query: cm.has_aggregate(query),
+ lambda node: (
+ cm.has_partial_aggregate(node)
+ and cm.has_no_local_filtering(node)
+ ),
+ x_getter=lambda node: float(cm.get_table_row_count(node.table_name)),
+ series_suffix=lambda node: f'{node.index_name or node.table_name}',
+ options=ChartOptions(adjust_cost_by_actual_rows=False),
+ ),
+ chart_agg_pushdown.make_variant(
+ 'log scale',
+ options=ChartOptions(adjust_cost_by_actual_rows=False,
+ log_scale_x=True,
+ log_scale_cost=True,
+ log_scale_time=True),
+ ),
+ ChartSpec(
+ PlotType.X_TIME_COST_PLOT,
+ 'No filter full scans by output row x width',
+ '* need to adjust series grouping and query/node selection',
+ 'Output rows x width', 'Estimated cost', 'Execution time [ms]',
+ lambda query: True,
+ lambda node: (node.has_no_filter()
+ and not node.get_index_cond()
+ and not cm.has_partial_aggregate(node)),
+ x_getter=lambda node: float(node.rows) * cm.get_node_width(node),
+ series_suffix=lambda node: f'{node.index_name or node.table_name}',
+ ),
+ ]
diff --git a/src/actions/reports/cost_metric_metadata.py b/src/actions/reports/cost_metric_metadata.py
new file mode 100644
index 00000000..8b973f92
--- /dev/null
+++ b/src/actions/reports/cost_metric_metadata.py
@@ -0,0 +1,551 @@
+from collections.abc import Mapping
+
+
+class TableStats:
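+    # ncols: number of columns; width: total average row width in bytes
+    # (sum of pg_stats.avg_width, see the query in the comment further below).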
+ def __init__(self, ncols, width):
+ self.ncols = ncols
+ self.width = width
+
+
+class ColumnStats:
+ def __init__(self, ndv, vmin, vmax, null_frac):
+ self.ndv = ndv
+ self.vmin = vmin
+ self.vmax = vmax
+ self.null_frac = null_frac
+
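+    # Average gap between adjacent distinct values, assuming they are spread
+    # evenly over [vmin, vmax].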
+ def get_avg_value_distance(self):
+ return (self.vmax - self.vmin) / (self.ndv - 1)
+
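+    # Map a value into (0, 1]: vmin maps to d / (vmax - vmin + d) and vmax maps to 1,
+    # where d is the average value distance above.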
+ def normalize_value(self, value):
+ d = self.get_avg_value_distance()
+ return (value - self.vmin + d) / (self.vmax - self.vmin + d)
+
+
+# TODO: move these to data collection and save into .json and combine them with
+# CostMetrics.table_row_map and CostMetrics.column_position_map.
+# e.g.:
+# select
+# ' "'||tablename||'": TableStats('||count(0)||', '||sum(avg_width)||'),'
+# from pg_stats
+# where tablename in (
+# 't1000000c01', 't1000000c02', 't1000000c03', 't1000000c04', 't1000000c05',
+# 't1000000c06', 't1000000c07', 't1000000c08', 't1000000c09', 't1000000c10',
+# 't100000w125', 't100000w250', 't100000w500', 't100000w1k', 't100000w2k',
+# 't100000w3k', 't100000w4k', 't100000w5k', 't100000w6k', 't100000w7k',
+# 't100000w8k',
+# 't1000000d10','t1000000d20','t1000000d30','t1000000d40','t1000000d50',
+# 't1000000d60','t1000000d70','t1000000d80','t1000000d90','t1000000d100',
+# 't1000000i','t1000000bi','t1000000flt','t1000000dbl'
+# )
+# group by tablename
+# order by 1;
+table_stats_map: Mapping[str, TableStats] = dict({
+ "t1000000bi": TableStats(1, 8),
+ "t1000000c01": TableStats(1, 4),
+ "t1000000c02": TableStats(2, 8),
+ "t1000000c03": TableStats(3, 12),
+ "t1000000c04": TableStats(4, 16),
+ "t1000000c05": TableStats(5, 20),
+ "t1000000c06": TableStats(6, 24),
+ "t1000000c07": TableStats(7, 28),
+ "t1000000c08": TableStats(8, 32),
+ "t1000000c09": TableStats(9, 36),
+ "t1000000c10": TableStats(10, 40),
+ "t1000000d10": TableStats(1, 8),
+ "t1000000d100": TableStats(1, 54),
+ "t1000000d20": TableStats(1, 14),
+ "t1000000d30": TableStats(1, 18),
+ "t1000000d40": TableStats(1, 24),
+ "t1000000d50": TableStats(1, 28),
+ "t1000000d60": TableStats(1, 34),
+ "t1000000d70": TableStats(1, 38),
+ "t1000000d80": TableStats(1, 44),
+ "t1000000d90": TableStats(1, 48),
+ "t1000000dbl": TableStats(1, 8),
+ "t1000000flt": TableStats(1, 4),
+ "t1000000i": TableStats(1, 4),
+ "t100000w125": TableStats(1, 126),
+ "t100000w1k": TableStats(1, 1004),
+ "t100000w250": TableStats(1, 254),
+ "t100000w2k": TableStats(1, 2004),
+ "t100000w3k": TableStats(1, 3004),
+ "t100000w4k": TableStats(1, 4004),
+ "t100000w500": TableStats(1, 504),
+ "t100000w5k": TableStats(1, 5004),
+ "t100000w6k": TableStats(1, 6004),
+ "t100000w7k": TableStats(1, 7004),
+ "t100000w8k": TableStats(1, 8004),
+})
+
+#
+# select
+# 'select'
+# ||' '' "'||relname||'.'||attname||'": ColumnStats(''||count(distinct v)||'', '''
+# ||' ||min(v)||'', ''||max(v)||'', ''||(count(*) - count(v))/count(*)||''),'''
+# ||'from (select '||attname||' as v from '||relname||') vv;'
+# from
+# pg_namespace nc
+# join pg_class c on nc.oid = relnamespace
+# join pg_attribute a on attrelid = c.oid
+# where
+# relkind = 'r'
+# and attnum >= 0
+# and nspname in ('public')
+# order by
+# nspname,
+# relname,
+# attnum;
+#
+column_stats_map: Mapping[str, ColumnStats] = dict({
+ "t1000000m.c0": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000m.c1": ColumnStats(2, 50, 100, 0),
+ "t1000000m.c2": ColumnStats(4, 25, 100, 0),
+ "t1000000m.c3": ColumnStats(10, 10, 100, 0),
+ "t1000000m.c4": ColumnStats(50, 2, 100, 0),
+ "t1000000m.c5": ColumnStats(100000, 1, 100000, 0),
+ "t1000000m.c6": ColumnStats(10000, 1, 10000, 0),
+ "t1000000c10.c0": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c1": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c2": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c3": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c4": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c5": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c6": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c7": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c8": ColumnStats(1000000, 1, 1000000, 0),
+ "t1000000c10.c9": ColumnStats(1000000, 1, 1000000, 0),
+})
+
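+# Precomputed "index key prefix gaps" keyed by "<index name>:<index condition>":
+# the number of distinct values of the index key columns preceding the first
+# equality condition (see the key_prefix_gaps chart descriptions in cost_chart_specs.py).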
+index_prefix_gap_map: Mapping[str, int] = dict({
+ ("t1000000m_c1c2c3c4:((c1 <= 50) AND (c2 = 50) AND (c3 = 50))"): 1,
+ ("t1000000m_c1c2c3c4:((c1 <= 50) AND (c2 = 50))"): 1,
+ ("t1000000m_c1c2c3c4:((c1 = 50) AND (c2 <= 50) AND (c3 = 50))"): 2,
+ ("t1000000m_c1c2c3c4:((c1 = 50) AND (c3 = 50))"): 4,
+ ("t1000000m_c1c2c3c4:((c2 <= 50) AND (c3 = 50) AND (c4 = 50))"): 4,
+ ("t1000000m_c1c2c3c4:((c2 = 50) AND (c3 = 50) AND (c4 <= 50))"): 2,
+ ("t1000000m_c1c2c3c4:((c2 = 50) AND (c3 = 50) AND (c4 = 50))"): 2,
+ ("t1000000m_c1c2c3c4:((c2 = 50) AND (c3 = 50))"): 2,
+ ("t1000000m_c1c2c3c4:((c2 = 50) AND (c4 <= 50))"): 20,
+ ("t1000000m_c1c2c3c4:((c2 = 50) AND (c4 = 50))"): 20,
+ ("t1000000m_c1c2c3c4:((c3 = 50) AND (c4 = 50))"): 8,
+ ("t1000000m_c1c2c3c4:(c2 <= 50)"): 2,
+ ("t1000000m_c1c2c3c4:(c2 = 50)"): 2,
+ ("t1000000m_c1c2c3c4:(c3 <= 50)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 10)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 100)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 20)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 30)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 40)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 50)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 60)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 70)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 80)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = 90)"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30,40,50,60,70,80,90,100}'::"
+ "integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30,40,50,60,70,80,90}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30,40,50,60,70,80}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30,40,50,60,70}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30,40,50,60}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30,40,50}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30,40}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20,30}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,20}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,30,50,70,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{10,50,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{20,30,40,50,60,70,80,90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{30,40,50,60,70,80,90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{40,50,60,70,80,90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{50,60,70,80,90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{60,70,80,90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{70,80,90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{80,90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c3 = ANY ('{90,100}'::integer[]))"): 8,
+ ("t1000000m_c1c2c3c4:(c4 <= 1)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 10)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 100)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 2)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 20)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 30)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 40)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 50)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 60)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 70)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 80)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 <= 90)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 100)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 12)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 2)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 24)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 34)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 44)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 50)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 56)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 66)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 78)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = 88)"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,12,24,34,44,56,66,78,88,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,14,26,38,50,64,76,88,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,16,30,44,58,72,86,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,18,34,38,50,66,84,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,20,40,60,80,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,26,50,74,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,34,66,100}'::integer[]))"): 80,
+ ("t1000000m_c1c2c3c4:(c4 = ANY ('{2,50,100}'::integer[]))"): 80,
+ ("t1000000m_c3c4c5:((c4 <= 10) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 50,
+ ("t1000000m_c3c4c5:((c4 <= 100) AND (c5 = ANY ('{1,1011,2021,3031,4041,"
+ "5051,6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 500,
+ ("t1000000m_c3c4c5:((c4 <= 2) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 <= 20) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 100,
+ ("t1000000m_c3c4c5:((c4 <= 30) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 150,
+ ("t1000000m_c3c4c5:((c4 <= 40) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 200,
+ ("t1000000m_c3c4c5:((c4 <= 50) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 250,
+ ("t1000000m_c3c4c5:((c4 <= 60) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 300,
+ ("t1000000m_c3c4c5:((c4 <= 70) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 350,
+ ("t1000000m_c3c4c5:((c4 <= 80) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 400,
+ ("t1000000m_c3c4c5:((c4 <= 90) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 450,
+ ("t1000000m_c3c4c5:((c4 = 50) AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,"
+ "6061,7071,8081,9091,10101,11112,12122,13132,14142,15152,16162,17172,"
+ "18182,19192,20202,21212,22223,23233,24243,25253,26263,27273,28283,29293,"
+ "30303,31313,32323,33334,34344,35354,36364,37374,38384,39394,40404,41414,"
+ "42424,43434,44445,45455,46465,47475,48485,49495,50505,51515,52525,53535,"
+ "54545,55556,56566,57576,58586,59596,60606,61616,62626,63636,64646,65656,"
+ "66667,67677,68687,69697,70707,71717,72727,73737,74747,75757,76767,77778,"
+ "78788,79798,80808,81818,82828,83838,84848,85858,86868,87878,88889,89899,"
+ "90909,91919,92929,93939,94949,95959,96969,97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,100}'::integer[])) AND (c5 = ANY ('{1,"
+ "1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,11112,12122,13132,"
+ "14142,15152,16162,17172,18182,19192,20202,21212,22223,23233,24243,25253,"
+ "26263,27273,28283,29293,30303,31313,32323,33334,34344,35354,36364,37374,"
+ "38384,39394,40404,41414,42424,43434,44445,45455,46465,47475,48485,49495,"
+ "50505,51515,52525,53535,54545,55556,56566,57576,58586,59596,60606,61616,"
+ "62626,63636,64646,65656,66667,67677,68687,69697,70707,71717,72727,73737,"
+ "74747,75757,76767,77778,78788,79798,80808,81818,82828,83838,84848,85858,"
+ "86868,87878,88889,89899,90909,91919,92929,93939,94949,95959,96969,97979,"
+ "98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,12,24,34,44,56,66,78,88,100}'::integer[])) "
+ "AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,"
+ "11112,12122,13132,14142,15152,16162,17172,18182,19192,20202,21212,22223,"
+ "23233,24243,25253,26263,27273,28283,29293,30303,31313,32323,33334,34344,"
+ "35354,36364,37374,38384,39394,40404,41414,42424,43434,44445,45455,46465,"
+ "47475,48485,49495,50505,51515,52525,53535,54545,55556,56566,57576,58586,"
+ "59596,60606,61616,62626,63636,64646,65656,66667,67677,68687,69697,70707,"
+ "71717,72727,73737,74747,75757,76767,77778,78788,79798,80808,81818,82828,"
+ "83838,84848,85858,86868,87878,88889,89899,90909,91919,92929,93939,94949,"
+ "95959,96969,97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,14,26,38,50,64,76,88,100}'::integer[])) "
+ "AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,"
+ "11112,12122,13132,14142,15152,16162,17172,18182,19192,20202,21212,22223,"
+ "23233,24243,25253,26263,27273,28283,29293,30303,31313,32323,33334,34344,"
+ "35354,36364,37374,38384,39394,40404,41414,42424,43434,44445,45455,46465,"
+ "47475,48485,49495,50505,51515,52525,53535,54545,55556,56566,57576,58586,"
+ "59596,60606,61616,62626,63636,64646,65656,66667,67677,68687,69697,70707,"
+ "71717,72727,73737,74747,75757,76767,77778,78788,79798,80808,81818,82828,"
+ "83838,84848,85858,86868,87878,88889,89899,90909,91919,92929,93939,94949,"
+ "95959,96969,97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,16,30,44,58,72,86,100}'::integer[])) "
+ "AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,"
+ "11112,12122,13132,14142,15152,16162,17172,18182,19192,20202,21212,22223,"
+ "23233,24243,25253,26263,27273,28283,29293,30303,31313,32323,33334,34344,"
+ "35354,36364,37374,38384,39394,40404,41414,42424,43434,44445,45455,46465,"
+ "47475,48485,49495,50505,51515,52525,53535,54545,55556,56566,57576,58586,"
+ "59596,60606,61616,62626,63636,64646,65656,66667,67677,68687,69697,70707,"
+ "71717,72727,73737,74747,75757,76767,77778,78788,79798,80808,81818,82828,"
+ "83838,84848,85858,86868,87878,88889,89899,90909,91919,92929,93939,94949,"
+ "95959,96969,97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,18,34,38,50,66,84,100}'::integer[])) "
+ "AND (c5 = ANY ('{1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,"
+ "11112,12122,13132,14142,15152,16162,17172,18182,19192,20202,21212,22223,"
+ "23233,24243,25253,26263,27273,28283,29293,30303,31313,32323,33334,34344,"
+ "35354,36364,37374,38384,39394,40404,41414,42424,43434,44445,45455,46465,"
+ "47475,48485,49495,50505,51515,52525,53535,54545,55556,56566,57576,58586,"
+ "59596,60606,61616,62626,63636,64646,65656,66667,67677,68687,69697,70707,"
+ "71717,72727,73737,74747,75757,76767,77778,78788,79798,80808,81818,82828,"
+ "83838,84848,85858,86868,87878,88889,89899,90909,91919,92929,93939,94949,"
+ "95959,96969,97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,20,40,60,80,100}'::integer[])) AND (c5 = "
+ "ANY ('{1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,11112,12122,"
+ "13132,14142,15152,16162,17172,18182,19192,20202,21212,22223,23233,24243,"
+ "25253,26263,27273,28283,29293,30303,31313,32323,33334,34344,35354,36364,"
+ "37374,38384,39394,40404,41414,42424,43434,44445,45455,46465,47475,48485,"
+ "49495,50505,51515,52525,53535,54545,55556,56566,57576,58586,59596,60606,"
+ "61616,62626,63636,64646,65656,66667,67677,68687,69697,70707,71717,72727,"
+ "73737,74747,75757,76767,77778,78788,79798,80808,81818,82828,83838,84848,"
+ "85858,86868,87878,88889,89899,90909,91919,92929,93939,94949,95959,96969,"
+ "97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,26,50,74,100}'::integer[])) AND (c5 = "
+ "ANY ('{1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,11112,12122,"
+ "13132,14142,15152,16162,17172,18182,19192,20202,21212,22223,23233,24243,"
+ "25253,26263,27273,28283,29293,30303,31313,32323,33334,34344,35354,36364,"
+ "37374,38384,39394,40404,41414,42424,43434,44445,45455,46465,47475,48485,"
+ "49495,50505,51515,52525,53535,54545,55556,56566,57576,58586,59596,60606,"
+ "61616,62626,63636,64646,65656,66667,67677,68687,69697,70707,71717,72727,"
+ "73737,74747,75757,76767,77778,78788,79798,80808,81818,82828,83838,84848,"
+ "85858,86868,87878,88889,89899,90909,91919,92929,93939,94949,95959,96969,"
+ "97979,98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,34,66,100}'::integer[])) AND (c5 = ANY ('{"
+ "1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,11112,12122,13132,"
+ "14142,15152,16162,17172,18182,19192,20202,21212,22223,23233,24243,25253,"
+ "26263,27273,28283,29293,30303,31313,32323,33334,34344,35354,36364,37374,"
+ "38384,39394,40404,41414,42424,43434,44445,45455,46465,47475,48485,49495,"
+ "50505,51515,52525,53535,54545,55556,56566,57576,58586,59596,60606,61616,"
+ "62626,63636,64646,65656,66667,67677,68687,69697,70707,71717,72727,73737,"
+ "74747,75757,76767,77778,78788,79798,80808,81818,82828,83838,84848,85858,"
+ "86868,87878,88889,89899,90909,91919,92929,93939,94949,95959,96969,97979,"
+ "98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:((c4 = ANY ('{2,50,100}'::integer[])) AND (c5 = ANY ('{"
+ "1,1011,2021,3031,4041,5051,6061,7071,8081,9091,10101,11112,12122,13132,"
+ "14142,15152,16162,17172,18182,19192,20202,21212,22223,23233,24243,25253,"
+ "26263,27273,28283,29293,30303,31313,32323,33334,34344,35354,36364,37374,"
+ "38384,39394,40404,41414,42424,43434,44445,45455,46465,47475,48485,49495,"
+ "50505,51515,52525,53535,54545,55556,56566,57576,58586,59596,60606,61616,"
+ "62626,63636,64646,65656,66667,67677,68687,69697,70707,71717,72727,73737,"
+ "74747,75757,76767,77778,78788,79798,80808,81818,82828,83838,84848,85858,"
+ "86868,87878,88889,89899,90909,91919,92929,93939,94949,95959,96969,97979,"
+ "98989,100000}'::integer[])))"): 10,
+ ("t1000000m_c3c4c5:(c4 <= 1)"): 10,
+ ("t1000000m_c3c4c5:(c4 <= 50)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 100)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 12)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 2)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 24)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 34)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 44)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 50)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 56)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 66)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 78)"): 10,
+ ("t1000000m_c3c4c5:(c4 = 88)"): 10,
+ ("t1000000m_c3c4c5:(c5 = 1)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 100000)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 10101)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 20202)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 30303)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 40404)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 50505)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 60606)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 70707)"): 500,
+ ("t1000000m_c3c4c5:(c5 = 80808)"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{1,1011,2021,3031,4041,5051,6061,7071,8081,"
+ "9091,10101,11112,12122,13132,14142,15152,16162,17172,18182,19192,20202,"
+ "21212,22223,23233,24243,25253,26263,27273,28283,29293,30303,31313,32323,"
+ "33334,34344,35354,36364,37374,38384,39394,40404,41414,42424,43434,44445,"
+ "45455,46465,47475,48485,49495,50505,51515,52525,53535,54545,55556,56566,"
+ "57576,58586,59596,60606,61616,62626,63636,64646,65656,66667,67677,68687,"
+ "69697,70707,71717,72727,73737,74747,75757,76767,77778,78788,79798,80808,"
+ "81818,82828,83838,84848,85858,86868,87878,88889,89899,90909,91919,92929,"
+ "93939,94949,95959,96969,97979,98989,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{1,12500,25000,35700,50000,62500,75000,"
+ "87500,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,"
+ "18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,"
+ "42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,"
+ "66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,"
+ "90,91,92,93,94,95,96,97,98,99,100}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{1,2,3,4,5,6,7,8,9,10}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{1,2,3,4,5}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{1,25000,50000,75000,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{1,2}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99901,99902,99903,99904,99905,99906,99907,"
+ "99908,99909,99910,99911,99912,99913,99914,99915,99916,99917,99918,99919,"
+ "99920,99921,99922,99923,99924,99925,99926,99927,99928,99929,99930,99931,"
+ "99932,99933,99934,99935,99936,99937,99938,99939,99940,99941,99942,99943,"
+ "99944,99945,99946,99947,99948,99949,99950,99951,99952,99953,99954,99955,"
+ "99956,99957,99958,99959,99960,99961,99962,99963,99964,99965,99966,99967,"
+ "99968,99969,99970,99971,99972,99973,99974,99975,99976,99977,99978,99979,"
+ "99980,99981,99982,99983,99984,99985,99986,99987,99988,99989,99990,99991,"
+ "99992,99993,99994,99995,99996,99997,99998,99999,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99991,99992,99993,99994,99995,99996,99997,"
+ "99998,99999,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99992,99993,99994,99995,99996,99997,99998,"
+ "99999,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99993,99994,99995,99996,99997,99998,99999,"
+ "100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99994,99995,99996,99997,99998,99999,100000}'::"
+ "integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99995,99996,99997,99998,99999,100000}'::"
+ "integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99996,99997,99998,99999,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99997,99998,99999,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99998,99999,100000}'::integer[]))"): 500,
+ ("t1000000m_c3c4c5:(c5 = ANY ('{99999,100000}'::integer[]))"): 500,
+ ("t1000000m_c4c2c3c1:((c2 <= 50) AND (c3 = 50) AND (c1 = 50))"): 100,
+ ("t1000000m_c4c2c3c1:((c2 = 50) AND (c1 <= 50))"): 500,
+ ("t1000000m_c4c2c3c1:((c2 = 50) AND (c1 = 50))"): 500,
+ ("t1000000m_c4c2c3c1:((c2 = 50) AND (c3 = 50) AND (c1 <= 50))"): 50,
+ ("t1000000m_c4c2c3c1:((c2 = 50) AND (c3 = 50) AND (c1 = 50))"): 50,
+ ("t1000000m_c4c2c3c1:((c2 = 50) AND (c3 = 50))"): 50,
+ ("t1000000m_c4c2c3c1:((c3 = 50) AND (c1 = 50))"): 200,
+ ("t1000000m_c4c2c3c1:((c4 <= 50) AND (c2 = 50) AND (c3 = 50))"): 25,
+ ("t1000000m_c4c2c3c1:((c4 <= 50) AND (c2 = 50))"): 25,
+ ("t1000000m_c4c2c3c1:((c4 = 50) AND (c2 <= 50) AND (c3 = 50))"): 2,
+ ("t1000000m_c4c2c3c1:((c4 = 50) AND (c3 = 50))"): 4,
+ ("t1000000m_c4c2c3c1:(c1 <= 50)"): 2000,
+ ("t1000000m_c4c2c3c1:(c1 = 50)"): 2000,
+ ("t1000000m_c4c2c3c1:(c2 <= 50)"): 50,
+ ("t1000000m_c4c2c3c1:(c2 = 50)"): 50,
+ ("t1000000m_c4c2c3c1:(c3 <= 50)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 10)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 100)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 20)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 30)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 40)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 50)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 60)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 70)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 80)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = 90)"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30,40,50,60,70,80,90,100}'::"
+ "integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30,40,50,60,70,80,90}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30,40,50,60,70,80}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30,40,50,60,70}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30,40,50,60}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30,40,50}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30,40}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20,30}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,20}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,30,50,70,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{10,50,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{20,30,40,50,60,70,80,90,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{30,40,50,60,70,80,90,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{40,50,60,70,80,90,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{50,60,70,80,90,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{60,70,80,90,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{70,80,90,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{80,90,100}'::integer[]))"): 200,
+ ("t1000000m_c4c2c3c1:(c3 = ANY ('{90,100}'::integer[]))"): 200,
+ ("t1000000m_c5c4c3:((c4 <= 50) AND (c3 = 50))"): 457403,
+ ("t1000000m_c5c4c3:((c4 = 50) AND (c3 = 50))"): 100000,
+ ("t1000000m_c5c4c3:(c3 <= 50)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 10)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 100)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 20)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 30)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 40)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 50)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 60)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 70)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 80)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = 90)"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30,40,50,60,70,80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30,40,50,60,70,80,90}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30,40,50,60,70,80}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30,40,50,60,70}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30,40,50,60}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30,40,50}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30,40}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20,30}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,20}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,30,50,70,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{10,50,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{20,30,40,50,60,70,80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{30,40,50,60,70,80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{40,50,60,70,80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{50,60,70,80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{60,70,80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{70,80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{80,90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c3 = ANY ('{90,100}'::integer[]))"): 914738,
+ ("t1000000m_c5c4c3:(c4 <= 50)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 100)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 12)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 2)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 24)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 34)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 44)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 50)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 56)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 66)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 78)"): 100000,
+ ("t1000000m_c5c4c3:(c4 = 88)"): 100000,
+})
diff --git a/src/actions/reports/cost_metrics.py b/src/actions/reports/cost_metrics.py
new file mode 100644
index 00000000..90259468
--- /dev/null
+++ b/src/actions/reports/cost_metrics.py
@@ -0,0 +1,728 @@
+import re
+
+from collections.abc import Iterable, Mapping
+from config import Config
+from dataclasses import dataclass
+from itertools import pairwise
+
+from objects import PlanNodeVisitor, PlanPrinter, Query
+from objects import AggregateNode, JoinNode, SortNode, PlanNode, ScanNode
+
+from actions.reports.cost_metric_metadata import (table_stats_map, column_stats_map,
+ index_prefix_gap_map)
+
+
+expr_classifier_pattern = re.compile(
+    r'[ (]*((\w+\.)*(?P<column>c\d+)[ )]* *(?P<op>=|>=|<=|<>|<|>)'
+    r' *(?P<rhs>(?P<number>\d+)|(?:ANY \(\'{(?P<lit_array>[0-9,]+)}\'::integer\[\]\))'
+    r'|(?:ANY \((?P<bnl_array>ARRAY\[[$0-9a-z_,. ]+\])\))))'
+)
+
+in_list_item_extraction_pattern = re.compile(
+    r'\$(?P<first>\d+)[ ,\$0-9]+..., \$(?P<last>\d+)'
+)
+
+# assume cost-validation model naming convention for now:
+#   <index name>: <table>_<key columns>_<included columns>
+#   <column>: [cv]\d*
+#   <key columns> | <included columns>: {...}
+index_key_extraction_pattern = re.compile(
+    r'(?P<table>\w[0-9a-z]+)_(?P<key>\w[0-9a-z]+)(?:_(?P<inc>\w[0-9a-z]+))*'
+)
+packed_column_list_pattern = re.compile(r'[cv]\d*')
+
+
+class NodeClassifiers:
+ def __init__(self, node: PlanNode):
+ self.is_seq_scan = False
+ self.is_any_index_scan = False
+ self.is_join = False
+ self.is_aggregate = False
+ self.is_sort = False
+ self.has_index_access_cond = False
+ self.has_scan_filter = False
+ self.has_tfbr_filter = False
+ self.has_local_filter = False
+ self.has_rows_removed_by_recheck = False
+ self.has_no_condition = False
+ self.has_partial_aggregate = False
+
+ if isinstance(node, ScanNode):
+ self.is_seq_scan = node.is_seq_scan
+ self.is_any_index_scan = node.is_any_index_scan
+ self.has_index_access_cond = bool(node.get_index_cond())
+ self.has_scan_filter = bool(node.get_remote_filter())
+ self.has_tfbr_filter = bool(node.get_remote_tfbr_filter())
+ self.has_local_filter = bool(node.get_local_filter())
+ self.has_rows_removed_by_recheck = bool(node.get_rows_removed_by_recheck())
+ self.has_partial_aggregate = node.is_scan_with_partial_aggregate()
+ self.has_no_condition = not any([
+ self.has_index_access_cond,
+ self.has_scan_filter,
+ self.has_tfbr_filter,
+ self.has_local_filter,
+ ])
+ elif isinstance(node, JoinNode):
+ self.is_join = True
+ elif isinstance(node, AggregateNode):
+ self.is_aggregate = True
+ elif isinstance(node, SortNode):
+ self.is_sort = True
+
+ def __str__(self):
+ return ','.join(filter(lambda a: getattr(self, a), self.__dict__.keys()))
+
+
+@dataclass
+class PlanClassifiers:
+ is_single_table: bool = False
+ has_join: bool = False
+ has_aggregate: bool = False
+ has_sort: bool = False
+ has_key_access_index: bool = False
+ has_scan_filter_index: bool = False
+ has_tfbr_filter_index: bool = False
+ has_no_filter_index: bool = False
+ has_table_filter_seqscan: bool = False
+ has_local_filter: bool = False
+
+ # these need to be computed at the end
+ has_single_scan_node: bool = False
+ has_no_condition_scan: bool = False
+
+ def __str__(self):
+ return ','.join(filter(lambda a: getattr(self, a), self.__dict__.keys()))
+
+ def update(self, nc: NodeClassifiers):
+ self.has_join |= nc.is_join
+ self.has_aggregate |= nc.is_aggregate
+ self.has_sort |= nc.is_sort
+ self.has_table_filter_seqscan |= (nc.is_seq_scan and nc.has_scan_filter)
+ self.has_local_filter |= nc.has_local_filter
+ if nc.is_any_index_scan:
+ self.has_key_access_index |= nc.has_index_access_cond
+ self.has_scan_filter_index |= nc.has_scan_filter
+ self.has_tfbr_filter_index |= nc.has_tfbr_filter
+ self.has_no_filter_index |= not (self.has_scan_filter_index
+ or self.has_tfbr_filter_index
+ or self.has_local_filter)
+
+ def merge(self, other):
+ for a in self.__dict__.keys():
+ if getattr(other, a):
+ setattr(self, a, True)
+
+
+@dataclass(frozen=True)
+class PlanContext:
+ parent_query: Query
+ index: int
+ plan_tree: PlanNode
+
+ def get_query(self):
+ return self.parent_query.optimizations[self.index] if self.index else self.parent_query
+
+
+@dataclass(frozen=True)
+class NodeContext:
+ plan_context: PlanContext
+ node_width: int
+ node_classifiers: NodeClassifiers
+
+ def get_parent_query(self):
+ return self.plan_context.parent_query
+
+ def get_query(self):
+ return self.plan_context.get_query()
+
+ def get_plan_tree(self):
+ return self.plan_context.plan_tree
+
+
+@dataclass(frozen=True)
+class QueryContext:
+ query: Query
+ pc: PlanClassifiers
+
+ def get_columns(self):
+ return sorted(set(f.name for t in self.query.tables for f in t.fields))
+
+
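+# Splits a scan condition on AND and classifies each branch as a simple
+# column-vs-literal comparison, a literal IN-list, a batched-NL parameterized
+# IN-list, or a complex expression, recording the referenced columns.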
+class ExpressionAnalyzer:
+ def __init__(self, expr):
+ self.expr = expr
+ self.columns: set[str] = set()
+ self.simple_comp_exprs: int = 0
+ self.literal_in_lists: int = 0
+ self.bnl_in_lists: int = 0
+ self.complex_exprs: int = 0
+ self.prop_list: Iterable[Mapping] = list()
+ self.__analyze()
+
+ def is_simple_expr(self):
+ return (len(self.columns) == 1
+ and self.simple_comp_exprs >= 1
+ and self.literal_in_lists == 0
+ and self.bnl_in_lists == 0
+ and self.complex_exprs == 0)
+
+ def has_key_prefix_cond_only(self, key_cols):
+ cols = set(self.columns) # make a copy
+ for kc in key_cols:
+ if kc in cols:
+ cols.remove(kc)
+ if not len(cols):
+ return True
+ return False
+
+ def __analyze(self):
+ if not self.expr or not self.expr.strip():
+ return list()
+ for branch in re.split(r'\bAND\b', self.expr):
+ if m := expr_classifier_pattern.search(branch):
+ if column := m.group('column'):
+ self.columns.add(column)
+ op = m.group('op')
+ rhs = m.group('rhs')
+ number = m.group('number')
+ self.simple_comp_exprs += bool(column and op and number)
+
+ num_list_items = None
+
+ if literal_array := m.group('lit_array'):
+ num_list_items = len(literal_array.split(','))
+ self.literal_in_lists += 1
+ bnl_array = None
+ elif bnl_array := m.group('bnl_array'):
+ self.bnl_in_lists += 1
+ num_list_items = self.__count_inlist_items(bnl_array)
+
+ self.prop_list.append(dict(column=column,
+ op=op,
+ rhs=rhs,
+ number=number,
+ num_list_items=num_list_items,
+ literal_array=literal_array,
+ bnl_array=bnl_array,))
+ else:
+ self.complex_exprs += 1
+ self.prop_list.append(dict(complex=branch))
+
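+    # Estimate the number of items in a parameterized "= ANY (...)" in-list
+    # (as produced by batched nested loop joins); an elided list such as
+    # "$1, $2, ..., $9" is counted from the first and last parameter numbers.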
+ @staticmethod
+ def __count_inlist_items(expr):
+ start = 0
+ end = 0
+ if ((start := expr.find('= ANY (')) > 0
+ and (end := expr.find(')', start)) > 0):
+ if m := in_list_item_extraction_pattern.search(expr[start:end]):
+ first = int(m.group('first'))
+ last = int(m.group('last'))
+ return last - first + 2
+ return len(expr[start:end].split(','))
+ return 0
+
+
+class PlanNodeCollectorContext:
+ def __init__(self):
+        self.table_node_map: Mapping[str, Iterable[ScanNode]] = dict()
+ self.pc = PlanClassifiers()
+
+ def __str__(self):
+ s = ''
+        for t, nodes in self.table_node_map.items():
+ s += f' {t}: {len(nodes)} nodes'
+ for n in nodes:
+ s += f' {n.get_full_str()}'
+ s += f' plan_classifiers: [{self.pc}]'
+ return s
+
+
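+# Plan tree visitor that attempts to patch nodes reporting invalid costs in
+# place; visiting returns True if any node could not be fixed up.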
+class InvalidCostFixer(PlanNodeVisitor):
+ def __init__(self, root: PlanNode):
+ super().__init__()
+ self.root = root
+ self.error = False
+
+ def generic_visit(self, node):
+ if node.fixup_invalid_cost():
+ self.error = True
+ else:
+ super().generic_visit(node)
+ return self.error
+
+
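+# Plan tree visitor that records per-node context (output row width and node
+# classifiers) and groups the scan nodes of each plan by table.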
+class PlanNodeCollector(PlanNodeVisitor):
+ def __init__(self, ctx, plan_ctx, node_context_map, logger):
+ super().__init__()
+ self.ctx = ctx
+ self.plan_ctx = plan_ctx
+ self.node_context_map = node_context_map
+ self.logger = logger
+ self.num_scans = 0
+ self.depth = 0
+ self.scan_node_width_map = self.compute_scan_node_width(plan_ctx.get_query())
+
+ def __enter(self):
+ self.ctx.pc.__init__()
+
+ def __exit(self):
+ self.ctx.pc.has_single_scan_node = (self.num_scans == 1)
+ self.ctx.pc.has_no_condition_scan = not any([
+ self.ctx.pc.has_key_access_index,
+ self.ctx.pc.has_scan_filter_index,
+ self.ctx.pc.has_tfbr_filter_index,
+ self.ctx.pc.has_table_filter_seqscan,
+ self.ctx.pc.has_local_filter,
+ ])
+
+ def generic_visit(self, node):
+ if self.depth == 0:
+ self.__enter()
+ self.depth += 1
+
+ classifiers = NodeClassifiers(node)
+ self.ctx.pc.update(classifiers)
+ self.node_context_map[id(node)] = NodeContext(self.plan_ctx, None, classifiers)
+ super().generic_visit(node)
+
+ self.depth -= 1
+ if self.depth == 0:
+ self.__exit()
+
+ def visit_scan_node(self, node):
+ if self.depth == 0:
+ self.__enter()
+ self.depth += 1
+ self.num_scans += 1
+
+ if int(node.nloops) > 0:
+ table = node.table_alias or node.table_name
+ node_width = self.scan_node_width_map.get(table)
+ # try postgres-generated number suffixed alias
+ if (not node_width and node.table_alias
+ and (m := re.fullmatch(fr'({node.table_name})_\d+', node.table_alias))):
+ table = m.group(1)
+ node_width = self.scan_node_width_map.get(table)
+ # use the estimated width if still no avail (TAQO collector was not able to find
+ # matching table/field metadata)
+ if not node_width:
+ node_width = node.plan_width
+
+ classifiers = NodeClassifiers(node)
+ self.ctx.pc.update(classifiers)
+ self.node_context_map[id(node)] = NodeContext(self.plan_ctx, node_width, classifiers)
+
+ if (node.is_seq_scan
+ or node.is_any_index_scan
+ or node.node_type in ['Bitmap Index Scan', 'Bitmap Heap Scan']):
+ self.ctx.table_node_map.setdefault(table, list()).append(node)
+ else:
+                self.logger.warn(f'Unknown ScanNode: node_type={node.node_type}')
+
+ super().generic_visit(node)
+
+ self.depth -= 1
+ if self.depth == 0:
+ self.__exit()
+
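+    # Estimate the output row width of a scan on each table by summing the
+    # average (or defined) widths of its columns, keyed by table alias/name.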
+ @staticmethod
+ def compute_scan_node_width(query):
+ scan_node_width_map = dict()
+ if not query or not query.tables:
+ return dict()
+ for t in query.tables:
+ width = 0
+ for f in t.fields:
+ width += f.avg_width or f.defined_width
+ scan_node_width_map[t.alias or t.name] = width
+ return scan_node_width_map
+
+
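+# Collects table metadata plus per-query and per-plan-node contexts and
+# classifiers used by the cost metrics report.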
+class CostMetrics:
+ def __init__(self, model):
+ self.model = model
+ self.logger = Config().logger
+        self.table_row_map: Mapping[str, float] = dict()
+        self.column_position_map: Mapping[str, int] = dict()
+        self.node_context_map: Mapping[int, NodeContext] = dict()
+        self.query_context_map: Mapping[str, QueryContext] = dict()
+        self.query_table_node_map: Mapping[str, Mapping[str, Iterable[ScanNode]]] = dict()
+        self.expr_analyzers: Mapping[str, ExpressionAnalyzer] = dict()
+
+ self.num_plans: int = 0
+ self.num_invalid_cost_plans: int = 0
+ self.num_invalid_cost_plans_fixed: int = 0
+ self.num_no_opt_queries: int = 0
+ self.warned = set()
+
+ def warn_once(self, msg):
+ if msg not in self.warned:
+ self.logger.warn(msg)
+ self.warned.add(msg)
+
+ def add_table_metadata(self, tables):
+ for t in tables:
+ self.table_row_map[t.name] = t.rows
+ for f in t.fields:
+ self.column_position_map[f'{t.name}:{f.name}'] = f.position
+
+ def process_plan(self, ctx, parent_query, index):
+ query = parent_query if index is None else parent_query.optimizations[index]
+ if not (plan := query.execution_plan):
+ self.logger.warn(f"=== Query ({index or 'default'}): [{query.query}]"
+ " does not have any valid plan\n")
+ return
+
+ if not (ptree := plan.parse_plan()):
+ self.logger.warn(f"=== Failed to parse plan ===\n{plan.full_str}\n")
+ else:
+ self.num_plans += 1
+ if not ptree.has_valid_cost():
+ self.num_invalid_cost_plans += 1
+ invalid_cost_plan = (f'hints: [{query.explain_hints}]\n'
+ f'{PlanPrinter.build_plan_tree_str(ptree, actual=False)}')
+ self.logger.debug(f'=== Found plan with invalid costs ===\n{invalid_cost_plan}\n')
+ if InvalidCostFixer(ptree).visit(ptree):
+ self.logger.warn('*** Failed to fixup invalid costs:\n====\n'
+ f'{invalid_cost_plan}\n==== Skipping...')
+ return
+
+ self.num_invalid_cost_plans_fixed += 1
+ self.logger.debug('=== Fixed up invalid costs successfully ===\n'
+ f'{PlanPrinter.build_plan_tree_str(ptree, actual=False)}')
+
+ pctx = PlanContext(parent_query, index, ptree)
+ PlanNodeCollector(ctx, pctx, self.node_context_map, self.logger).visit(ptree)
+
+ def add_query(self, query: type[Query]):
+ self.logger.debug(f'Adding {query.tag} {query.query_hash}: {query.query}...')
+ self.add_table_metadata(query.tables)
+
+ pc = PlanClassifiers()
+ ctx = PlanNodeCollectorContext()
+ self.process_plan(ctx, query, index=None)
+ pc.merge(ctx.pc)
+
+ if not query.optimizations:
+ self.num_no_opt_queries += 1
+ else:
+ for ix, opt in enumerate(query.optimizations):
+ if opt.execution_plan and opt.execution_plan.full_str:
+ self.process_plan(ctx, query, ix)
+ pc.merge(ctx.pc)
+
+ pc.is_single_table = len(query.tables) == 1
+
+ self.logger.debug(f'query classifiers: [{pc}]')
+
+ self.query_table_node_map[query.query] = dict()
+
+ for table, node_list in ctx.table_node_map.items():
+ self.query_table_node_map[query.query].setdefault(table, list()).extend(node_list)
+
+ self.query_context_map[query.query] = QueryContext(query, pc)
+
+ def get_node_query(self, node):
+ return self.node_context_map[id(node)].get_query()
+
+ def get_node_parent_query(self, node):
+ return self.node_context_map[id(node)].get_parent_query()
+
+ def get_node_query_str(self, node):
+ return self.get_node_query(node).query
+
+ def get_node_plan_tree(self, node):
+ return self.node_context_map[id(node)].get_plan_tree()
+
+ def get_node_classifiers(self, node):
+ return self.node_context_map[id(node)].node_classifiers
+
+ @staticmethod
+ def get_per_row_cost(node):
+ return ((float(node.total_cost) - float(node.startup_cost))
+ / (float(node.plan_rows) if float(node.plan_rows) else 1))
+
+ @staticmethod
+ def get_per_row_time(node):
+ return ((float(node.total_ms) - float(node.startup_ms))
+ / (float(node.rows) if float(node.rows) else 1))
+
+ def get_node_width(self, node):
+ return (0 if self.is_no_project_query(self.get_node_query(node).query)
+ else int(self.node_context_map[id(node)].node_width))
+
+ def get_columns_in_query(self, query_str):
+ return self.query_context_map[query_str].get_columns()
+
+ def has_no_condition(self, node):
+ return self.get_node_classifiers(node).has_no_condition
+
+ def has_no_local_filtering(self, node):
+ nc = self.get_node_classifiers(node)
+ return not nc.has_local_filter and not nc.has_rows_removed_by_recheck
+
+ def has_only_simple_condition(self, node, index_cond_only=False, index_key_prefix_only=False):
+ nc = self.get_node_classifiers(node)
+ if (nc.has_partial_aggregate
+ or not any([nc.has_index_access_cond,
+ nc.has_scan_filter,
+ nc.has_tfbr_filter])):
+ return False
+ conds = list()
+
+ if node.index_name:
+ conds += [node.get_index_cond()]
+ if index_key_prefix_only:
+ key_cols, _ = self.get_index_columns(node.index_name)
+ if not self.get_expression_analyzer(*conds).has_key_prefix_cond_only(key_cols):
+ return False
+ if not index_cond_only:
+ conds += [node.get_remote_filter(), node.get_remote_tfbr_filter()]
+ elif nc.has_scan_filter or nc.has_tfbr_filter:
+ return False
+ cond_str = ' AND '.join(filter(lambda cond: bool(cond), conds))
+ return self.get_expression_analyzer(cond_str).is_simple_expr()
+
+ def has_only_scan_filter_condition(self, node):
+ nc = self.get_node_classifiers(node)
+ return (not any([nc.has_partial_aggregate,
+ nc.has_index_access_cond,
+ nc.has_local_filter])
+ and any([nc.has_scan_filter,
+ nc.has_tfbr_filter]))
+
+ def has_partial_aggregate(self, node):
+ return self.get_node_classifiers(node).has_partial_aggregate
+
+ def get_table_row_count(self, table_name):
+ if (nrows := self.table_row_map.get(table_name, -1)) < 0:
+ self.warn_once(f'{table_name}: table row count unavailable')
+ return float(nrows)
+
+ def get_table_column_count(self, table_name):
+ # TODO: get it from the catalog and save into the .json
+ if ts := table_stats_map.get(table_name):
+ return ts.ncols
+ self.warn_once(f'{table_name}: table column count unavailable')
+ return 1
+
+ def get_table_row_width(self, table_name):
+ # TODO: get it from the catalog and save into the .json
+ if ts := table_stats_map.get(table_name):
+ return ts.width
+ self.warn_once(f'{table_name}: table row width unavailable')
+ return 4
+
+ def get_table_column_position(self, table_name, column_name):
+ cname = f'{table_name}:{column_name}'
+ if (pos := self.column_position_map.get(cname, -1)) < 0:
+ self.warn_once(f'{cname}: table column position unavailable')
+ return float(pos)
+
+ @staticmethod
+ def get_index_columns(index_name):
+ # TODO: load the index key columns and save them into the .json
+ key_cols = list()
+ inc_cols = list()
+ if index_name == 't1000000m_pkey' or re.match(r't1000000c\d+_pkey', index_name):
+ key_cols = list(['c0'])
+ elif index_name.endswith('_pkey'):
+ key_cols = list(['c1'])
+ elif (m := index_key_extraction_pattern.match(index_name)):
+ key_cols = packed_column_list_pattern.findall(m.group('key'))
+ inc_cols = packed_column_list_pattern.findall(m.group('inc') or '')
+ return key_cols, inc_cols
+
+ def get_column_position(self, table_name, index_name, column_name):
+ if index_name:
+ key_cols, inc_cols = self.get_index_columns(index_name)
+ if column_name in key_cols:
+ return key_cols.index(column_name) + 1
+ if column_name in inc_cols:
+ return len(key_cols) + inc_cols.index(column_name) + 1
+ return -1
+ return self.get_table_column_position(table_name, column_name)
+
+ def get_single_column_query_column(self, node):
+ return self.get_columns_in_query(self.get_node_query_str(node))[0]
+
+ def get_single_column_query_column_position(self, node):
+ if not isinstance(node, ScanNode):
+ return -1
+ column_name = self.get_single_column_query_column(node)
+ return self.get_column_position(node.table_name, node.index_name, column_name)
+
+ def get_single_column_node_normalized_eq_cond_value(self, node):
+ if not isinstance(node, ScanNode):
+ return None
+ ea = self.get_expression_analyzer(node.get_search_condition_str())
+ if ea.is_simple_expr() and ea.simple_comp_exprs == 1:
+ prop = ea.prop_list[0]
+ if prop['op'] == '=':
+ cst = column_stats_map.get(f'{node.table_name}.{prop["column"]}')
+ return cst.normalize_value(int(prop['rhs']))
+ return None
+
+ def get_plan_classifiers(self, query_str):
+ return self.query_context_map[query_str].pc
+
+ def is_single_table_query(self, query_str):
+ return self.get_plan_classifiers(query_str).is_single_table
+
+ def is_no_project_query(self, query_str):
+ return (query_str.lower().startswith('select 0 from')
+ or query_str.lower().startswith('select count(*) from'))
+
+ def has_no_filter_indexscan(self, query_str):
+ return self.get_plan_classifiers(query_str).has_no_filter_index
+
+ def has_scan_filter_indexscan(self, query_str):
+ return self.get_plan_classifiers(query_str).has_scan_filter_index
+
+ def has_local_filter(self, query_str):
+ return self.get_plan_classifiers(query_str).has_local_filter
+
+ def has_aggregate(self, query_str):
+ return self.get_plan_classifiers(query_str).has_aggregate
+
+ @staticmethod
+ def wrap_expr(expr, len):
+ line_start = 0
+ lines = list()
+ for m in re.finditer(r'\w+', expr):
+ if m.end() - line_start > len:
+ lines += [expr[line_start:m.start()]]
+ line_start = m.start()
+
+ lines += [expr[line_start:]]
+ return lines
+
+ def get_expression_analyzer(self, expr):
+ ea = self.expr_analyzers.get(expr)
+ if not ea:
+ ea = ExpressionAnalyzer(expr)
+ self.expr_analyzers[expr] = ea
+ return ea
+
+ def count_non_contiguous_literal_inlist_items(self, table, expr):
+ num_item_list = list()
+ for ea_prop in self.get_expression_analyzer(expr).prop_list:
+ if literal_array := ea_prop.get('literal_array'):
+ if (table and (cst := column_stats_map.get(f'{table}.{ea_prop["column"]}'))
+ and cst.ndv and cst.vmin): # not empty table and not all-nulls
+ gap = cst.get_avg_value_distance()
+ ar = literal_array.split(',')
+ if gap and (int(ar[-1]) - int(ar[0]))/gap + 1 > len(ar):
+ ngaps = 0
+ for v1, v2 in pairwise(ar):
+ ngaps += int(bool(int(v2) - int(v1)))
+
+ num_item_list.append(ngaps + 1)
+
+ return num_item_list
+
+ def build_non_contiguous_literal_inlist_count_str(self, table, expr):
+ num_item_list = self.count_non_contiguous_literal_inlist_items(table, expr)
+ return ('x'.join(filter(lambda item: bool(item), map(str, sorted(num_item_list))))
+ if num_item_list else '1')
+
+ def has_literal_inlist_index_cond(self, node, single_in_list_only=False):
+ if not isinstance(node, ScanNode):
+ return False
+ ea = self.get_expression_analyzer(node.get_index_cond())
+ if single_in_list_only:
+ return (len(ea.columns) == 1
+ and ea.simple_comp_exprs == 0
+ and ea.literal_in_lists == 1
+ and ea.bnl_in_lists == 0
+ and ea.complex_exprs == 0)
+ return ea.literal_in_lists > 0
+
+ def has_bnl_inlist_index_cond(self, node):
+ return (self.get_expression_analyzer(node.get_index_cond()).bnl_in_lists > 0
+ if isinstance(node, ScanNode) else False)
+
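+    # Look up the precomputed number of distinct key-prefix combinations
+    # ("gaps") for this index condition; defaults to 1 when no entry exists.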
+ def get_index_key_prefix_gaps(self, node):
+ if (not isinstance(node, ScanNode)
+ or not node.index_name
+ or not (expr := node.get_index_cond())):
+ return 1
+ return index_prefix_gap_map.get(f'{node.index_name}:{expr}', 1)
+
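+    # Collect the leading index key columns that are either not referenced or
+    # not constrained by equality in the condition; these create the prefix gaps.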
+ @staticmethod
+ def gather_index_prefix_gap_query_parts(index_name, ea: ExpressionAnalyzer):
+ if ((m := index_key_extraction_pattern.match(index_name))
+ and (table := m.group("table"))
+ and (key_cols := packed_column_list_pattern.findall(m.group('key')))):
+ last_cond_key_pos = len(key_cols)
+ for pos in reversed(range(last_cond_key_pos)):
+ if key_cols[pos] in ea.columns:
+ last_cond_key_pos = pos
+ break
+
+ gap_cols = list()
+ cond = list()
+ for col in key_cols[:last_cond_key_pos]:
+ if col not in ea.columns:
+ gap_cols += [col]
+ else:
+ for prop in ea.prop_list:
+ if prop['column'] == col:
+ op = prop['op']
+ if op != '=':
+ gap_cols += [col]
+ cond += [f'({col} {op} {prop["rhs"]})']
+
+ if gap_cols:
+ return gap_cols, table, cond
+
+ return list(), '', list()
+
+ @staticmethod
+ def build_index_prefix_gap_query(gap_cols, table, cond):
+ if not gap_cols or not table:
+ return ''
+ s = 'select count(*) from (select distinct '
+ s += ', '.join(gap_cols)
+ s += f' from {table}'
+ if cond:
+ s += ' where '
+ s += ' and '.join(cond)
+
+ s += ') v'
+ return s
+
+ def build_index_prefix_gap_queries(self):
+ scan_node_list = list()
+ for table_node_list_map in self.query_table_node_map.values():
+ if nlist := (table_node_list_map.get('t1000000m')
+ or table_node_list_map.get('t1000000c10')):
+ scan_node_list += filter(lambda node: (bool(node.index_name)
+ and node.get_index_cond()), nlist)
+
+ querymap = dict()
+ queries = set()
+ for node in scan_node_list:
+ indname = node.index_name
+ expr = node.get_index_cond()
+ ea = self.get_expression_analyzer(expr)
+ indexpr = f'{indname}:{expr}'
+ if indexpr not in querymap:
+ if query := self.build_index_prefix_gap_query(
+ *self.gather_index_prefix_gap_query_parts(indname, ea)):
+ queries.add(query)
+ querymap[indexpr] = query
+
+ self.logger.info(f'=== generating index prefix gap queries ({len(queries)}) ===')
+ with open('report/index-gap-queries.sql', 'w') as gap_queries:
+ lines = ';\n'.join(sorted(queries))
+ lines += ';\n'
+ gap_queries.write(lines)
+ self.logger.info(f'=== generating index prefix gap map entries ({len(querymap)})===')
+ with open('report/index-gap-map.in', 'w') as gap_map:
+ lines = ''
+ for indexpr, query in sorted(querymap.items()):
+ lines += ' ("'
+ lines += '"\n "'.join(self.wrap_expr(indexpr, 72))
+ lines += f'"): {{{query}}},\n'
+ gap_map.write(lines)
diff --git a/src/actions/reports/regression.py b/src/actions/reports/regression.py
new file mode 100644
index 00000000..76256416
--- /dev/null
+++ b/src/actions/reports/regression.py
@@ -0,0 +1,680 @@
+from dataclasses import dataclass
+from typing import Type
+
+import numpy as np
+from sql_formatter.core import format_sql
+from matplotlib import pyplot as plt
+from matplotlib import rcParams
+
+from collect import CollectResult
+from objects import Query
+from actions.report import AbstractReportAction
+from utils import allowed_diff, get_plan_diff, extract_execution_time_from_analyze, calculate_client_execution_time
+
+
+@dataclass
+class ShortSummaryReport:
+ diff_plans: int = 0
+ diff_rpc_calls: int = 0
+ diff_wait_times: int = 0
+ diff_scanned_rows: int = 0
+ diff_peak_memory: int = 0
+
+
+class RegressionReport(AbstractReportAction):
+ def __init__(self):
+ super().__init__()
+
+ self.v1_name = None
+ self.v2_name = None
+ self.queries = {}
+ self.short_summary = ShortSummaryReport()
+
+ @classmethod
+ def generate_report(cls,
+ v1_name: str,
+ v2_name: str,
+ loq_v1: CollectResult,
+ loq_v2: CollectResult):
+ report = RegressionReport()
+
+ report.define_version_names(v1_name, v2_name)
+ report.define_version(loq_v1.db_version, loq_v2.db_version)
+
+ report.report_config(loq_v1.database_config, f"{v1_name} Flags")
+ report.report_config(loq_v1.config, v1_name)
+ report.report_config(loq_v2.database_config, f"{v2_name} Flags")
+ report.report_config(loq_v2.config, v2_name)
+
+ report.report_model(loq_v1.model_queries)
+
+ for query in loq_v1.queries:
+ if v2_query := loq_v2.find_query_by_hash(query.query_hash):
+ report.add_query(query, v2_query)
+ else:
+ report.logger.warn(f"{query.query_hash} is not found in v2!")
+
+ report.build_report()
+ report.build_xls_report()
+
+ report.publish_report("regression")
+ report.publish_short_report()
+
+ def get_report_name(self):
+ return "Regression"
+
+ def define_version(self, first_version, second_version):
+ self.content += f"[GIT COMMIT/VERSION]\n====\n" \
+ f"First:\n{first_version}\n\nSecond:\n{second_version}\n====\n\n"
+
+ def add_query(self, first_query: Type[Query], second_query: Type[Query]):
+ if first_query.tag not in self.queries:
+ self.queries[first_query.tag] = [[first_query, second_query], ]
+ else:
+ self.queries[first_query.tag].append([first_query, second_query])
+
+ def create_query_plot(self, best_optimization, optimizations, query, postfix_name):
+ if not optimizations:
+ return "NO PLOT"
+
+ rcParams['font.family'] = 'serif'
+ rcParams['font.size'] = 6
+ plt.xlabel('Execution time [ms]')
+ plt.ylabel('Predicted cost')
+
+ plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms > 0],
+ [q.execution_plan.get_estimated_cost() for q in optimizations if
+ q.execution_time_ms > 0], 'k.',
+ [query.execution_time_ms],
+ [query.execution_plan.get_estimated_cost()], 'r^',
+ [best_optimization.execution_time_ms],
+ [best_optimization.execution_plan.get_estimated_cost()], 'go')
+
+ file_name = f'imgs/query_{self.reported_queries_counter}_{postfix_name}.png'
+ plt.savefig(f"report/{self.start_date}/{file_name}", dpi=300)
+ plt.close()
+
+ return file_name
+
+ @staticmethod
+ def fix_last_newline_in_result(result, rows):
+ if result:
+ splitted_result = result.split("\n")
+ result = "\n".join(splitted_result[:-1])
+ last_newline = splitted_result[-1]
+ rows[0] = f"{last_newline}{rows[0]}"
+ result += "\n"
+
+ return result
+
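+    # Fit a least-squares line (degree 1) to cost vs. execution time and shade
+    # the pointwise standard-error band around the fitted values.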
+ @staticmethod
+ def generate_regression_and_standard_errors(x_data, y_data):
+ x = np.array(x_data)
+ y = np.array(y_data)
+ n = x.size
+
+ a, b = np.polyfit(x, y, deg=1)
+ y_est = a * x + b
+ y_err = (y - y_est).std() * np.sqrt(1 / n + (x - x.mean()) ** 2 / np.sum((x - x.mean()) ** 2))
+
+ fig, ax = plt.subplots()
+
+ plt.xlabel('Predicted cost')
+ plt.ylabel('Execution time [ms]')
+
+ ax.plot(x, y_est, '-')
+ ax.fill_between(x, y_est - y_err, y_est + y_err, alpha=0.2)
+ ax.plot(x, y, 'k.')
+
+ return fig
+
+ def create_default_query_plots(self):
+ file_names = ['imgs/all_queries_defaults_yb_v1.png',
+ 'imgs/all_queries_defaults_yb_v2.png']
+
+ for i in range(2):
+ x_data = []
+ y_data = []
+
+ for tag, queries in self.queries.items():
+ for yb_pg_queries in queries:
+ query = yb_pg_queries[i]
+ if query.execution_time_ms:
+ x_data.append(query.execution_plan.get_estimated_cost())
+ y_data.append(query.execution_time_ms)
+
+ fig = self.generate_regression_and_standard_errors(x_data, y_data)
+ fig.savefig(f"report/{self.start_date}/{file_names[i]}", dpi=300)
+ plt.close()
+
+ return file_names
+
+ def build_report(self):
+ # link to top
+ self.add_plan_comparison()
+ self.add_rpc_calls()
+ self.add_rpc_wait_times()
+ self.add_scanned_rows()
+ self.add_peak_memory_collapsible()
+
+ self.start_table("2")
+ self.content += f"|{self.v1_name}|{self.v2_name}\n"
+ default_query_plots = self.create_default_query_plots()
+ self.content += f"a|image::{default_query_plots[0]}[{self.v1_name},align=\"center\"]\n"
+ self.content += f"a|image::{default_query_plots[1]}[{self.v2_name},align=\"center\"]\n"
+ self.end_table()
+
+ self.content += "\n== QO score\n"
+
+ yb_v1_bests = 0
+ yb_v2_bests = 0
+ qe_bests_geo = []
+ qe_default_geo = []
+ qo_yb_v1_bests = []
+ qo_yb_v2_bests = []
+ total = 0
+
+ better_different_plans = 0
+ worse_different_plans = 0
+ better_same_plans = 0
+ worse_same_plans = 0
+ same_plans = 0
+
+ v2_has_optimizations = True
+
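+        # Classify each v1/v2 query pair: did the plan change between versions,
+        # and did execution time move outside the allowed percentage delta.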
+ for queries in self.queries.values():
+ for query in queries:
+ yb_v1_query = query[0]
+ yb_v2_query = query[1]
+
+ if yb_v2_query.optimizations is None:
+ v2_has_optimizations = False
+
+ yb_v1_best = yb_v1_query.get_best_optimization(self.config)
+ yb_v2_best = yb_v2_query.get_best_optimization(self.config) if v2_has_optimizations else yb_v1_best
+
+ v1_success = yb_v1_query.execution_time_ms > 0
+ v2_success = yb_v2_query.execution_time_ms > 0
+
+ if v1_success and v2_success:
+ better_different_plans += 1 if ((
+ yb_v1_query.execution_time_ms > yb_v2_query.execution_time_ms and
+ not allowed_diff(self.config, yb_v1_query.execution_time_ms,
+ yb_v2_query.execution_time_ms)) and
+ not yb_v1_query.compare_plans(yb_v2_query)) \
+ else 0
+ worse_different_plans += 1 if (
+ yb_v1_query.execution_time_ms < yb_v2_query.execution_time_ms and
+ not allowed_diff(self.config, yb_v1_query.execution_time_ms,
+ yb_v2_query.execution_time_ms) and
+ not yb_v1_query.compare_plans(yb_v2_query)) \
+ else 0
+ better_same_plans += 1 if ((
+ yb_v1_query.execution_time_ms > yb_v2_query.execution_time_ms and
+ not allowed_diff(self.config, yb_v1_query.execution_time_ms, yb_v2_query.execution_time_ms)) and
+ yb_v1_query.compare_plans(yb_v2_query)) \
+ else 0
+ worse_same_plans += 1 if (
+ yb_v1_query.execution_time_ms < yb_v2_query.execution_time_ms and
+ not allowed_diff(self.config, yb_v1_query.execution_time_ms, yb_v2_query.execution_time_ms) and
+ yb_v1_query.compare_plans(yb_v2_query)) \
+ else 0
+ same_plans += 1 if (
+ yb_v1_query.compare_plans(yb_v2_query) and
+ allowed_diff(self.config, yb_v1_query.execution_time_ms, yb_v2_query.execution_time_ms)) \
+ else 0
+
+ qe_default_geo.append(yb_v2_query.execution_time_ms / yb_v1_query.execution_time_ms
+ if v1_success and v2_success else 1)
+                qe_bests_geo.append(yb_v2_best.execution_time_ms / yb_v1_best.execution_time_ms
+                                    if yb_v1_best.execution_time_ms > 0
+                                    and yb_v2_best.execution_time_ms > 0 else 1)
+
+ if v1_success and v2_success:
+ qo_yb_v1_bests.append((yb_v1_query.execution_time_ms
+ if yb_v1_query.execution_time_ms > 0 else 1.0) / \
+ (yb_v1_best.execution_time_ms
+ if yb_v1_best.execution_time_ms > 0 else 1))
+ if v1_success and v2_success:
+ qo_yb_v2_bests.append(yb_v2_query.execution_time_ms / yb_v2_best.execution_time_ms \
+ if yb_v2_best.execution_time_ms > 0 else 9999999)
+
+ yb_v1_bests += 1 if yb_v1_query.compare_plans(yb_v1_best) else 0
+ yb_v2_bests += 1 if yb_v2_query.compare_plans(yb_v2_best) else 0
+
+ total += 1
+
+ self.start_table("4,1,1")
+ self.content += f"|Statistic|{self.v1_name}|{self.v2_name}\n"
+ self.content += f"|Best execution plan picked|{'{:.2f}'.format(float(yb_v1_bests) * 100 / total)}%" \
+ f"|{'{:.2f}'.format(float(yb_v2_bests) * 100 / total)}%\n"
+ self.content += f"|Different better plans (exec time v1 > v2, outside {self.config.skip_percentage_delta} range)\n" \
+ f"2+m|{better_different_plans}\n"
+ self.content += f"|Different worse plans (exec time v1 < v2, not in {self.config.skip_percentage_delta} range)\n" \
+ f"2+m|{worse_different_plans}\n"
+ self.content += f"|Same better plans (exec time v1 > v2, outside {self.config.skip_percentage_delta} range)\n" \
+ f"2+m|{better_same_plans}\n"
+ self.content += f"|Same worse plans (exec time v1 < v2, not in {self.config.skip_percentage_delta} range)\n" \
+ f"2+m|{worse_same_plans}\n"
+ self.content += f"|Total same plans\n" \
+ f"2+m|{same_plans}\n"
+        self.content += f"|Geometric mean QE default\n" \
+ f"2+m|{'{:.2f}'.format(self.geo_mean(qe_default_geo))}\n"
+
+ if v2_has_optimizations:
+            self.content += f"|Geometric mean QE best\n" \
+ f"2+m|{'{:.2f}'.format(self.geo_mean(qe_bests_geo))}\n"
+
+        self.content += f"|Geometric mean QO default vs best" \
+ f"|{'{:.2f}'.format(self.geo_mean(qo_yb_v1_bests))}" \
+ f"|{'{:.2f}'.format(self.geo_mean(qo_yb_v2_bests))}\n"
+ self.end_table()
+
+ self.content += "\n[#top]\n== QE score\n"
+
+ num_columns = 7 if v2_has_optimizations else 6
+ v2_prefix = "Best" if v2_has_optimizations else "Default"
+ v2_best_col = f"|{self.v2_name} {v2_prefix}" if v2_has_optimizations else ""
+ table_layout = "1,1,1,1,1,1,4" if v2_has_optimizations else "1,1,1,1,1,4"
+ for tag, queries in self.queries.items():
+ self.start_table(table_layout)
+ self.content += f"|{self.v1_name}" \
+ f"|{self.v1_name} Best" \
+ f"|{self.v2_name}" \
+ f"{v2_best_col}" \
+ f"|Ratio {self.v2_name} vs Default {self.v1_name}" \
+ f"|Ratio {v2_prefix} {self.v2_name} vs Best {self.v1_name}" \
+ f"|Query\n"
+ self.content += f"{num_columns}+m|{tag}.sql\n"
+ for query in queries:
+ yb_v1_query = query[0]
+ yb_v2_query = query[1]
+
+ yb_v1_best = yb_v1_query.get_best_optimization(self.config)
+ yb_v2_best = yb_v2_query.get_best_optimization(self.config) if v2_has_optimizations else yb_v1_best
+
+ success = yb_v2_query.execution_time_ms > 0
+
+ default_v1_equality = "[green]" \
+ if yb_v1_query.compare_plans(yb_v1_best) else "[red]"
+ default_v2_equality = "[green]" \
+ if success and yb_v2_query.compare_plans(yb_v2_best) else "[red]"
+
+ if v2_has_optimizations:
+ best_yb_pg_equality = "(eq) " if yb_v1_best.compare_plans(yb_v2_best) else ""
+ else:
+ best_yb_pg_equality = "(eq) " if yb_v1_best.compare_plans(yb_v2_query) else ""
+
+ ratio_x3 = yb_v2_query.execution_time_ms / yb_v1_query.execution_time_ms \
+ if yb_v1_query.execution_time_ms > 0 else 99999999
+                ratio_x3_str = "{:.2f}".format(ratio_x3)
+ ratio_color = "[green]" if ratio_x3 <= (1.0 + self.config.skip_percentage_delta) else "[red]"
+
+ if v2_has_optimizations:
+ ratio_best = yb_v2_best.execution_time_ms / yb_v1_best.execution_time_ms \
+ if yb_v1_best.execution_time_ms > 0 and success else 99999999
+ ratio_best_x3_str = "{:.2f}".format(yb_v2_best.execution_time_ms / yb_v1_best.execution_time_ms
+ if yb_v1_best.execution_time_ms > 0 and success else 99999999)
+ ratio_best_color = "[green]" if ratio_best <= (1.0 + self.config.skip_percentage_delta) else "[red]"
+ else:
+ ratio_best = yb_v2_query.execution_time_ms / yb_v1_best.execution_time_ms \
+ if yb_v1_best.execution_time_ms > 0 and success else 99999999
+ ratio_best_x3_str = "{:.2f}".format(yb_v2_query.execution_time_ms / yb_v1_best.execution_time_ms
+ if yb_v1_best.execution_time_ms > 0 and success else 99999999)
+ ratio_best_color = "[green]" if ratio_best <= (1.0 + self.config.skip_percentage_delta) else "[red]"
+
+ bitmap_flag = "[blue]" \
+ if success and "bitmap" in yb_v2_query.execution_plan.full_str.lower() else "[black]"
+
+ b2_best_col = f"a|{default_v2_equality}#*{'{:.2f}'.format(yb_v2_best.execution_time_ms)}*#\n" \
+ if v2_has_optimizations else ""
+
+ self.content += f"a|[black]#*{'{:.2f}'.format(yb_v1_query.execution_time_ms)}*#\n" \
+ f"a|{default_v1_equality}#*{'{:.2f}'.format(yb_v1_best.execution_time_ms)}*#\n" \
+ f"a|{bitmap_flag}#*{'{:.2f}'.format(yb_v2_query.execution_time_ms)}*#\n" \
+ f"{b2_best_col}" \
+ f"a|{ratio_color}#*{ratio_x3_str}*#\n" \
+ f"a|{ratio_best_color}#*{best_yb_pg_equality}{ratio_best_x3_str}*#\n"
+
+ self.content += f"a|[#{yb_v1_query.query_hash}_top]"
+ self.append_tag_page_link(tag, yb_v1_query.query_hash, f"Query {yb_v1_query.query_hash}")
+
+ self.start_source(["sql"])
+ self.content += format_sql(yb_v2_query.get_reportable_query())
+ self.end_source()
+ self.content += "\n"
+ self.end_table_row()
+
+ self.end_table()
+
+ # different results links
+ for tag in self.queries.keys():
+ self.append_tag_page_link(tag, None, f"{tag} queries file")
+
+ for tag, queries in self.queries.items():
+ sub_report = self.create_sub_report(tag)
+ sub_report.content += f"\n[#{tag}]\n== {tag} queries file\n\n"
+ for query in queries:
+ self.__report_query(sub_report, query[0], query[1])
+
+ def add_plan_comparison(self):
+ self.start_collapsible("Plan comparison")
+ self.content += "\n[#plans_summary]\n"
+ self.start_table("2")
+ for tag, queries in self.queries.items():
+ num_same_plans = sum(1 for query in queries
+ if query[0].compare_plans(query[1]))
+ self.content += f"a|<<{tag}>>\n"
+ self.short_summary.diff_plans = len(queries) - num_same_plans
+ color = "[green]" if self.short_summary.diff_plans == 0 else "[orange]"
+ self.content += f"a|{color}#*{self.short_summary.diff_plans}*#\n"
+ self.end_table_row()
+ self.end_table()
+ self.end_collapsible()
+
+ def add_rpc_calls(self):
+ self.start_collapsible("RPC Calls")
+ self.content += "\n[#rpc_summary]\n"
+ self.start_table("2")
+ for tag, queries in self.queries.items():
+ self.short_summary.diff_rpc_calls = sum(
+ query[0].execution_plan.get_rpc_calls() != query[1].execution_plan.get_rpc_calls()
+ for query in queries
+ )
+ self.content += f"a|<<{tag}>>\n"
+ color = "[green]" if self.short_summary.diff_rpc_calls == 0 else "[orange]"
+ self.content += f"a|{color}#*{self.short_summary.diff_rpc_calls}*#\n"
+ self.end_table_row()
+ self.end_table()
+ self.end_collapsible()
+
+ def add_rpc_wait_times(self):
+ self.start_collapsible("RPC Wait Times")
+ self.content += "\n[#rpc_wait_summary]\n"
+ self.start_table("2")
+ for tag, queries in self.queries.items():
+ self.short_summary.diff_wait_times = sum(
+ query[0].execution_plan.get_rpc_wait_times() != query[1].execution_plan.get_rpc_wait_times()
+ for query in queries
+ )
+ self.content += f"a|<<{tag}>>\n"
+ color = "[green]" if self.short_summary.diff_wait_times == 0 else "[orange]"
+ self.content += f"a|{color}#*{self.short_summary.diff_wait_times}*#\n"
+ self.end_table_row()
+ self.end_table()
+ self.end_collapsible()
+
+ def add_scanned_rows(self):
+ self.start_collapsible("Scanned rows")
+ self.content += "\n[#rows_summary]\n"
+ self.start_table("2")
+ for tag, queries in self.queries.items():
+            self.short_summary.diff_scanned_rows = sum(
+                query[0].execution_plan.get_scanned_rows() != query[1].execution_plan.get_scanned_rows()
+                for query in queries
+            )
+            self.content += f"a|<<{tag}>>\n"
+            color = "[green]" if self.short_summary.diff_scanned_rows == 0 else "[orange]"
+            self.content += f"a|{color}#*{self.short_summary.diff_scanned_rows}*#\n"
+ self.end_table_row()
+ self.end_table()
+ self.end_collapsible()
+
+ def add_peak_memory_collapsible(self):
+ self.start_collapsible("Peak memory")
+ self.content += "\n[#memory_summary]\n"
+ self.start_table("2")
+ for tag, queries in self.queries.items():
+ self.short_summary.diff_peak_memory = sum(
+ query[0].execution_plan.get_peak_memory() != query[1].execution_plan.get_peak_memory()
+ for query in queries
+ )
+ self.content += f"a|<<{tag}>>\n"
+ color = "[green]" if self.short_summary.diff_peak_memory == 0 else "[orange]"
+ self.content += f"a|{color}#*{self.short_summary.diff_peak_memory}*#\n"
+ self.end_table_row()
+ self.end_table()
+ self.end_collapsible()
+
+ # noinspection InsecureHash
+ def __report_query(self, report, v1_query: Type[Query], v2_query: Type[Query]):
+ v2_has_optimizations = v2_query.optimizations is not None
+
+ v1_best = v1_query.get_best_optimization(self.config)
+ v2_best = v2_query.get_best_optimization(self.config) if v2_has_optimizations else v1_best
+
+ self.reported_queries_counter += 1
+
+ report.content += f"\n[#{v1_query.query_hash}]\n"
+ report.content += f"=== Query {v1_query.query_hash}"
+ report.content += f"\n{v1_query.tag}\n\n"
+ report.append_index_page_hashtag_link("top", "Go to index")
+ report.append_index_page_hashtag_link(f"{v1_query.query_hash}_top", "Show in summary")
+ report.add_double_newline()
+
+ report.start_source(["sql"])
+ report.content += format_sql(v1_query.get_reportable_query())
+ report.end_source()
+
+ if v2_has_optimizations:
+ report.start_table("2")
+ report.content += f"|{self.v1_name}|{self.v2_name}\n"
+ v1_query_plot = self.create_query_plot(v1_best, v1_query.optimizations, v1_query, "v1")
+ v2_query_plot = self.create_query_plot(v2_best, v2_query.optimizations, v2_query, "v2")
+ report.content += f"a|image::../{v1_query_plot}[{self.v1_name},align=\"center\"]\n"
+ report.content += f"a|image::../{v2_query_plot}[{self.v2_name},align=\"center\"]\n"
+ report.end_table()
+ else:
+ report.start_table("1")
+ report.content += f"|{self.v1_name}\n"
+ v1_query_plot = self.create_query_plot(v1_best, v1_query.optimizations, v1_query, "v1")
+ report.content += f"a|image::../{v1_query_plot}[{self.v1_name},align=\"center\",width=640,height=480]\n"
+ report.end_table()
+
+ report.add_double_newline()
+
+ report.add_double_newline()
+ default_v1_equality = "(eq) " if v1_query.compare_plans(v1_best) else ""
+
+ report.start_table("5")
+ report.content += f"|Metric|{self.v1_name}|{self.v1_name} Best|{self.v2_name}|{self.v2_name} Best\n"
+
+ default_v2_equality = "(eq) " if v2_query.compare_plans(v2_best) else ""
+ best_yb_pg_equality = "(eq) " if v1_best.compare_plans(v2_best) else ""
+ default_v1_v2_equality = "(eq) " if v1_query.compare_plans(v2_query) else ""
+
+ if 'order by' in v1_query.query:
+ report.start_table_row()
+ report.content += f"!! Result hash" \
+ f"|{v1_query.result_hash}" \
+ f"|{v1_best.result_hash}" \
+ f"|{v2_query.result_hash}" \
+ f"|{v2_best.result_hash}" \
+ if v2_query.result_hash != v1_query.result_hash else \
+ f"Result hash" \
+            f"|{v1_query.result_hash}" \
+ f"|{v1_best.result_hash}" \
+ f"|{v2_query.result_hash}" \
+ f"|{v2_best.result_hash}"
+ report.end_table_row()
+
+ report.start_table_row()
+ report.content += f"Cardinality" \
+ f"|{v1_query.result_cardinality}" \
+ f"|{v1_best.result_cardinality}" \
+ f"|{v2_query.result_cardinality}" \
+ f"|{v2_best.result_cardinality}"
+ report.end_table_row()
+ report.start_table_row()
+ report.content += f"Estimated cost" \
+ f"|{v1_query.execution_plan.get_estimated_cost()}" \
+ f"|{default_v1_equality}{v1_best.execution_plan.get_estimated_cost()}" \
+ f"|{default_v1_v2_equality}{v2_query.execution_plan.get_estimated_cost()}" \
+ f"|{default_v2_equality}{v2_best.execution_plan.get_estimated_cost()}"
+ report.end_table_row()
+ report.start_table_row()
+ report.content += f"Execution time" \
+ f"|{'{:.2f}'.format(v1_query.execution_time_ms)}" \
+ f"|{default_v1_equality}{'{:.2f}'.format(v1_best.execution_time_ms)}" \
+ f"|{'{:.2f}'.format(v2_query.execution_time_ms)}" \
+ f"|{default_v2_equality}{'{:.2f}'.format(v2_best.execution_time_ms)}"
+ report.end_table_row()
+ # report.start_table_row()
+ # report.content += f"Server execution time" \
+ # f"|{'{:.2f}'.format(extract_execution_time_from_analyze(v1_query.execution_plan.full_str))}" \
+ # f"|{default_v1_equality}{'{:.2f}'.format(extract_execution_time_from_analyze(v1_best.execution_plan.full_str))}" \
+ # f"|{'{:.2f}'.format(extract_execution_time_from_analyze(v2_query.execution_plan.full_str))}" \
+ # f"|{default_v2_equality}{'{:.2f}'.format(extract_execution_time_from_analyze(v2_best.execution_plan.full_str))}"
+ # report.end_table_row()
+ # report.start_table_row()
+ # report.content += f"Result collect time" \
+ # f"|{'{:.2f}'.format(calculate_client_execution_time(v1_query))}" \
+ # f"|{default_v1_equality}{'{:.2f}'.format(calculate_client_execution_time(v1_best))}" \
+ # f"|{'{:.2f}'.format(calculate_client_execution_time(v2_query))}" \
+ # f"|{default_v2_equality}{'{:.2f}'.format(calculate_client_execution_time(v2_best))}"
+ # report.end_table_row()
+
+ report.end_table()
+
+ report.start_table()
+ report.start_table_row()
+
+ report.start_collapsible(f"{self.v1_name} default plan")
+ report.start_source(["diff"])
+ report.content += v1_query.execution_plan.full_str
+ report.end_source()
+ report.end_collapsible()
+
+ report.start_collapsible(f"{default_v1_equality}{self.v1_name} best plan")
+ report.start_source(["diff"])
+ report.content += v1_best.execution_plan.full_str
+ report.end_source()
+ report.end_collapsible()
+
+ report.start_collapsible(f"{self.v2_name} default plan")
+ report.start_source(["diff"])
+ report.content += v2_query.execution_plan.full_str
+ report.end_source()
+ report.end_collapsible()
+
+ if v2_has_optimizations:
+ v2_best = v2_query.get_best_optimization(self.config)
+ report.start_collapsible(f"{default_v2_equality}{self.v2_name} best plan")
+ report.start_source(["diff"])
+ report.content += v2_best.execution_plan.full_str
+ report.end_source()
+ report.end_collapsible()
+
+ v2_prefix = "best" if v2_has_optimizations else "default"
+ report.start_collapsible(f"{best_yb_pg_equality}{self.v1_name} best vs {self.v2_name} {v2_prefix}")
+ report.start_source(["diff"])
+ report.content += get_plan_diff(
+ v1_best.execution_plan.full_str,
+ v2_best.execution_plan.full_str if v2_has_optimizations else v2_query.execution_plan.full_str,
+ )
+ report.end_source()
+ report.end_collapsible()
+
+ report.content += f"{default_v1_equality}{self.v1_name} vs {self.v2_name}\n"
+ report.start_source(["diff"])
+ diff = get_plan_diff(
+ v1_query.execution_plan.full_str,
+ v2_query.execution_plan.full_str
+ )
+ if not diff:
+ diff = v1_query.execution_plan.full_str
+
+ report.content += diff
+ report.end_source()
+ report.end_table_row()
+
+ report.content += "\n"
+
+ report.end_table()
+
+ report.add_double_newline()
+
+ def build_xls_report(self):
+ import xlsxwriter
+
+ workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_regression.xls')
+ worksheet = workbook.add_worksheet()
+
+ head_format = workbook.add_format()
+ head_format.set_bold()
+ head_format.set_bg_color('#999999')
+
+ eq_format = workbook.add_format()
+ eq_format.set_bold()
+ eq_format.set_bg_color('#d9ead3')
+
+ eq_bad_format = workbook.add_format()
+ eq_bad_format.set_bold()
+ eq_bad_format.set_bg_color('#fff2cc')
+
+ worksheet.write(0, 0, "First", head_format)
+ worksheet.write(0, 1, "Best First", head_format)
+ worksheet.write(0, 2, "Second", head_format)
+ worksheet.write(0, 3, "Second / First", head_format)
+ worksheet.write(0, 4, "Second - First", head_format)
+ worksheet.write(0, 5, "Second / Best First", head_format)
+ worksheet.write(0, 6, "Second - Best First", head_format)
+ worksheet.write(0, 7, "Query", head_format)
+ worksheet.write(0, 8, "Query Hash", head_format)
+ worksheet.write(0, 9, "First Plan", head_format)
+ worksheet.write(0, 10, "First Best Plan", head_format)
+ worksheet.write(0, 11, "Second Plan", head_format)
+
+ row = 1
+ # Iterate over the data and write it out row by row.
+ for tag, queries in self.queries.items():
+ for query in queries:
+ first_query: Query = query[0]
+ second_query: Query = query[1]
+
+ ratio = second_query.execution_time_ms / first_query.execution_time_ms \
+ if first_query.execution_time_ms > 0 else 99999999
+ ratio_color = eq_bad_format if ratio > 1.0 else eq_format
+ delta = second_query.execution_time_ms - first_query.execution_time_ms
+
+ v1_best = first_query.get_best_optimization(self.config)
+ v1_best_time = v1_best.execution_time_ms
+ ratio_v2_vs_v1_best = second_query.execution_time_ms / v1_best_time \
+ if v1_best_time > 0 else 99999999
+ ratio_v2_vs_v1_best_color = eq_bad_format \
+ if ratio_v2_vs_v1_best > 1.0 else eq_format
+ v2_vs_v1_best_delta = second_query.execution_time_ms - v1_best_time
+
+ worksheet.write(row, 0, '{:.2f}'.format(first_query.execution_time_ms))
+ worksheet.write(row, 1, '{:.2f}'.format(v1_best_time))
+ worksheet.write(row, 2, '{:.2f}'.format(second_query.execution_time_ms))
+ worksheet.write(row, 3, f'{ratio}', ratio_color)
+ worksheet.write(row, 4, f'{delta}', ratio_color)
+ worksheet.write(row, 5, f'{ratio_v2_vs_v1_best}', ratio_v2_vs_v1_best_color)
+ worksheet.write(row, 6, '{:.2f}'.format(v2_vs_v1_best_delta),
+ ratio_v2_vs_v1_best_color)
+ worksheet.write(row, 7, f'{format_sql(first_query.query)}')
+ worksheet.write(row, 8, f'{first_query.query_hash}')
+ worksheet.write(row, 9, f'{first_query.execution_plan}')
+ worksheet.write(row, 10,
+ '<---' if v1_best.compare_plans(first_query)
+ else f'{v1_best.execution_plan}')
+ worksheet.write(row, 11,
+ '<---' if second_query.compare_plans(v1_best)
+ else f'{second_query.execution_plan}')
+ row += 1
+
+ worksheet.autofit()
+ workbook.close()
+
+ def define_version_names(self, v1_name, v2_name):
+ self.v1_name = v1_name
+ self.v2_name = v2_name
+
+ def publish_short_report(self):
+ with open(f"report/{self.start_date}/short_regression_summary.txt", "w") as short_summary:
+ short_summary.write(f"Changed plans: {self.short_summary.diff_plans}\n")
+ short_summary.write(f"Changed scanned rows: {self.short_summary.diff_scanned_rows}\n")
+ short_summary.write(f"Changed RPC calls: {self.short_summary.diff_rpc_calls}\n")
+ short_summary.write(f"Changed RPC wait times: {self.short_summary.diff_wait_times}\n")
+ short_summary.write(f"Changed peak memory: {self.short_summary.diff_peak_memory}\n")
diff --git a/src/actions/reports/score.py b/src/actions/reports/score.py
new file mode 100644
index 00000000..02fd13fe
--- /dev/null
+++ b/src/actions/reports/score.py
@@ -0,0 +1,838 @@
+from typing import Type
+
+import numpy as np
+from bokeh.embed import components
+from bokeh.layouts import gridplot
+from bokeh.models import ColumnDataSource, HoverTool, TapTool, BoxZoomTool, WheelZoomTool, PanTool, SaveTool, ResetTool
+from bokeh.models import OpenURL, CDSView, GroupFilter
+from bokeh.plotting import figure
+from bokeh.transform import factor_cmap
+from matplotlib import pyplot as plt
+from matplotlib import rcParams
+from scipy.stats import linregress
+from sql_formatter.core import format_sql
+
+from actions.report import AbstractReportAction
+from collect import CollectResult
+from db.postgres import PostgresQuery
+from objects import Query
+from utils import allowed_diff, get_plan_diff, extract_execution_time_from_analyze, calculate_client_execution_time
+
+
+class ScoreReport(AbstractReportAction):
+ def __init__(self):
+ super().__init__()
+
+ self.queries = {}
+ self.overall_plots = {
+ 'color': 'k.',
+ 'x_values': [],
+ 'y_values': []
+ }
+
+ @classmethod
+ def generate_report(cls, loq: CollectResult, pg_loq: CollectResult = None):
+ report = ScoreReport()
+
+ report.define_version(loq.db_version)
+ report.report_config(loq.database_config, "Flags")
+ report.report_config(loq.config, "YB")
+ report.report_config(pg_loq.config, "PG")
+
+ report.report_model(loq.model_queries)
+
+ for query in loq.queries:
+ pg_query = pg_loq.find_query_by_hash(query.query_hash) if pg_loq else None
+ if pg_query:
+ report.add_query(query, pg_query)
+ else:
+ report.logger.exception("No PG query found for hash %s", query.query_hash)
+ report.add_query(query, query.create_copy())
+
+ report.build_report()
+ report.build_xls_report()
+
+ report.publish_report("score")
+
+ def get_report_name(self):
+ return "score"
+
+ def define_version(self, version):
+ self.content += f"[VERSION]\n====\n{version}\n====\n\n"
+
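+    # Score a query as best-optimization time divided by default-plan time
+    # (1.00 means the default plan is already the best found); -1 marks a
+    # query with no successful default execution.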
+ def calculate_score(self, query):
+ if query.execution_time_ms == 0:
+ return -1
+ else:
+ return "{:.2f}".format(
+ query.get_best_optimization(self.config).execution_time_ms / query.execution_time_ms)
+
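+    # Build interactive Bokeh scatter plots (estimated cost vs. execution time)
+    # for YB and PG, highlighting queries whose default plan differs from the
+    # best found plan and linking each point to its query section.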
+ def create_default_query_plots_interactive(self):
+ data = {
+ 'query_hash': [],
+ 'query_tag': [],
+ 'query': [],
+ 'yb_cost': [],
+ 'yb_time': [],
+ 'pg_cost': [],
+ 'pg_time': [],
+ }
+
+ data_yb_bad_plans = {
+ 'yb_cost': [],
+ 'yb_time': [],
+ 'query_tag': [],
+ }
+
+ data_pg_bad_plans = {
+ 'pg_cost': [],
+ 'pg_time': [],
+ 'query_tag': [],
+ }
+
+ tags = []
+
+ for tag, queries in self.queries.items():
+ for yb_pg_queries in queries:
+ yb_query = yb_pg_queries[0]
+ pg_query = yb_pg_queries[1]
+ if yb_query and yb_query.execution_time_ms and pg_query and pg_query.execution_time_ms:
+ data["query_hash"].append(yb_query.query_hash)
+ data["query_tag"].append(tag)
+ tags.append(tag)
+ data["query"].append(yb_query.query)
+ data["yb_cost"].append(yb_query.execution_plan.get_estimated_cost())
+ data["yb_time"].append(yb_query.execution_time_ms)
+ data["pg_cost"].append(pg_query.execution_plan.get_estimated_cost())
+ data["pg_time"].append(pg_query.execution_time_ms)
+ yb_best = yb_query.get_best_optimization(self.config)
+ if not yb_query.compare_plans(yb_best):
+ data_yb_bad_plans["yb_cost"].append(yb_query.execution_plan.get_estimated_cost())
+ data_yb_bad_plans["yb_time"].append(yb_query.execution_time_ms)
+ data_yb_bad_plans["query_tag"].append(tag)
+ pg_best = pg_query.get_best_optimization(self.config)
+ if not pg_query.compare_plans(pg_best):
+ data_pg_bad_plans["pg_cost"].append(pg_query.execution_plan.get_estimated_cost())
+ data_pg_bad_plans["pg_time"].append(pg_query.execution_time_ms)
+ data_pg_bad_plans["query_tag"].append(tag)
+
+ source = ColumnDataSource(data)
+ source_yb_bad_plans = ColumnDataSource(data_yb_bad_plans)
+ source_pg_bad_plans = ColumnDataSource(data_pg_bad_plans)
+
+ TOOLTIPS = """
+
+ @query
+
+ """
+ hover_tool = HoverTool(tooltips=TOOLTIPS)
+ hover_tool.renderers = []
+ TOOLS = [TapTool(), BoxZoomTool(), WheelZoomTool(), PanTool(), SaveTool(), ResetTool(), hover_tool]
+
+ tags = sorted(list(set(data['query_tag'])))
+
+ # YB Plot
+ yb_plot = figure(x_axis_label='Estimated Cost',
+ y_axis_label='Execution Time (ms)',
+ title='Yugabyte',
+ width=600, height=600,
+ tools=TOOLS, active_drag=None)
+
+ for tag in tags:
+ view = CDSView(filter=GroupFilter(column_name='query_tag', group=tag))
+            # Highlight queries with bad plans
+ yb_plot.scatter("yb_cost", "yb_time", size=14, line_width=4,
+ source=source_yb_bad_plans, legend_label=tag, line_color='firebrick',
+ color=None, fill_alpha=0.0, view=view)
+ # Scatter plot for all queries
+ yb_scatter = yb_plot.scatter("yb_cost", "yb_time", size=10, source=source,
+ hover_color="black", legend_label=tag, view=view,
+ color=factor_cmap('query_tag', 'Category20_20', tags),
+ selection_color='black', nonselection_alpha=1.0)
+ hover_tool.renderers.append(yb_scatter)
+
+ # Interactive Legend
+ yb_plot.legend.click_policy = 'hide'
+
+ # Linear Regression Line
+ yb_x_np = np.array(data['yb_cost'])
+ yb_y_np = np.array(data['yb_time'])
+ try:
+ res = linregress(yb_x_np, yb_y_np)
+ yb_y_data_regress = res.slope * yb_x_np + res.intercept
+ yb_plot.line(x=yb_x_np, y=yb_y_data_regress)
+ except ValueError:
+            self.config.logger.warn("All x values are the same. Linear regression not calculated.")
+
+ # Tap event to jump to query
+ yb_url = 'tags/@query_tag.html#@query_hash'
+ yb_taptool = yb_plot.select(type=TapTool)
+ yb_taptool.callback = OpenURL(url=yb_url, same_tab=True)
+
+ # PG Plot
+ pg_plot = figure(x_axis_label='Estimated Cost',
+ y_axis_label='Execution Time (ms)',
+ title='Postgres',
+ width=600, height=600,
+ tools=TOOLS, tooltips=TOOLTIPS, active_drag=None)
+
+ for tag in tags:
+ view = CDSView(filter=GroupFilter(column_name='query_tag', group=tag))
+            # Highlight queries with bad plans
+ pg_plot.scatter("pg_cost", "pg_time", size=14, line_width=4,
+ source=source_pg_bad_plans, legend_label=tag, line_color='firebrick',
+ color=None, fill_alpha=0.0, view=view)
+ # Scatter plot for all queries
+ pg_scatter = pg_plot.scatter("pg_cost", "pg_time", size=10, source=source,
+ hover_color="black", legend_label=tag, view=view,
+ color=factor_cmap('query_tag', 'Category20_20', tags),
+ selection_color='black', nonselection_alpha=1.0)
+ hover_tool.renderers.append(pg_scatter)
+
+ # Interactive Legend
+ pg_plot.legend.click_policy = 'hide'
+
+ # Linear Regression Line
+ pg_x_np = np.array(data['pg_cost'])
+ pg_y_np = np.array(data['pg_time'])
+ try:
+ res = linregress(pg_x_np, pg_y_np)
+ pg_y_data_regress = res.slope * pg_x_np + res.intercept
+ pg_plot.line(x=pg_x_np, y=pg_y_data_regress)
+ except ValueError:
+            self.config.logger.warning("All x values are the same. Linear regression not calculated.")
+ # Tap event to jump to query
+ pg_url = 'tags/@query_tag.html#@query_hash'
+ pg_taptool = pg_plot.select(type=TapTool)
+ pg_taptool.callback = OpenURL(url=pg_url, same_tab=True)
+
+ GRIDPLOT = gridplot([[yb_plot, pg_plot]], sizing_mode='scale_both',
+ merge_tools=False)
+ script, div = components(GRIDPLOT)
+ return script, div
+
+ @staticmethod
+ def generate_regression_and_standard_errors(x_data, y_data):
+ x = np.array(x_data)
+ y = np.array(y_data)
+ n = x.size
+
+ a, b = np.polyfit(x, y, deg=1)
+ y_est = a * x + b
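+        # pointwise standard error of the fitted line: residual std scaled by the leverage of each x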
+ y_err = (y - y_est).std() * np.sqrt(1 / n + (x - x.mean()) ** 2 / np.sum((x - x.mean()) ** 2))
+
+ fig, ax = plt.subplots()
+
+ plt.xlabel('Predicted cost')
+ plt.ylabel('Execution time [ms]')
+
+ ax.plot(x, y_est, '-')
+ ax.fill_between(x, y_est - y_err, y_est + y_err, alpha=0.2)
+ ax.plot(x, y, 'k.')
+
+ return fig
+
+ def create_query_plot(self, best_optimization, optimizations, query, scale=""):
+ if not optimizations:
+ return "NO PLOT"
+
+ rcParams['font.family'] = 'serif'
+ rcParams['font.size'] = 6
+ plt.xlabel('Execution time [ms]')
+ plt.ylabel('Predicted cost')
+
+ if scale:
+ plt.xscale(scale)
+ plt.yscale(scale)
+
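+        # black dots - optimizations, red triangle - the default query, green circle - the best optimization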
+ plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms > 0],
+ [q.execution_plan.get_estimated_cost() for q in optimizations if
+ q.execution_time_ms > 0], 'k.',
+ [query.execution_time_ms],
+ [query.execution_plan.get_estimated_cost()], 'r^',
+ [best_optimization.execution_time_ms],
+ [best_optimization.execution_plan.get_estimated_cost()], 'go')
+
+ file_name = f'imgs/query_{self.reported_queries_counter}{scale}.png'
+ plt.savefig(f"report/{self.start_date}/{file_name}", dpi=300)
+ plt.close()
+
+ return file_name
+
+ def add_query(self, query: Type[Query], pg: Type[Query] | None):
+ if query.tag not in self.queries:
+ self.queries[query.tag] = [[query, pg], ]
+ else:
+ self.queries[query.tag].append([query, pg])
+
+ def build_report(self):
+ script, div = self.create_default_query_plots_interactive()
+ self.content += f"""
+++++
+
+
+{script}
+{div}
+++++
+"""
+
+ self.content += "\n== QO score\n"
+
+ yb_bests = 0
+ pg_bests = 0
+ qe_default_geo = []
+ qe_bests_geo = []
+ qo_yb_bests_geo = []
+ qo_pg_bests_geo = []
+ timed_out = 0
+ slower_then_10x = 0
+ best_slower_then_10x = 0
+ inconsistent_results = 0
+ total = 0
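+        # QE ratios compare YB vs PG execution time (default and best plans);
+        # QO ratios compare each engine's default plan against the best plan found for it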
+
+ for queries in self.queries.values():
+ for query in queries:
+ yb_query = query[0]
+ pg_query = query[1]
+
+ yb_best = yb_query.get_best_optimization(self.config)
+ pg_best = pg_query.get_best_optimization(self.config)
+
+ inconsistent_results += 1 if yb_query.get_inconsistent_results() else 0
+
+ pg_success = pg_query.execution_time_ms > 0
+ yb_success = yb_query.execution_time_ms > 0
+
+ qe_default_geo.append(yb_query.execution_time_ms / pg_query.execution_time_ms
+ if pg_success and yb_success else 1)
+ qe_bests_geo.append(yb_best.execution_time_ms / pg_best.execution_time_ms
+ if pg_success and yb_success else 1)
+
+ if yb_query.execution_time_ms > 0 and yb_best.execution_time_ms > 0:
+ qo_yb_bests_geo.append(yb_query.execution_time_ms / yb_best.execution_time_ms)
+ if pg_query.execution_time_ms > 0 and pg_best.execution_time_ms > 0:
+ qo_pg_bests_geo.append(pg_query.execution_time_ms / pg_best.execution_time_ms)
+
+ yb_bests += 1 if yb_query.compare_plans(yb_best) else 0
+ pg_bests += 1 if pg_success and pg_query.compare_plans(pg_best) else 0
+ timed_out += 1 if yb_query.execution_time_ms == -1 else 0
+ slower_then_10x += 1 if pg_query.execution_time_ms and \
+ (yb_query.execution_time_ms / pg_query.execution_time_ms) >= 10 else 0
+ best_slower_then_10x += 1 if pg_query.execution_time_ms and \
+ (yb_best.execution_time_ms / pg_query.execution_time_ms) >= 10 else 0
+
+ total += 1
+
+ self.start_table("4,1,1")
+ self.content += "|Statistic|YB|PG\n"
+ self.content += f"|Best execution plan picked|{'{:.2f}'.format(float(yb_bests) * 100 / total)}%" \
+ f"|{'{:.2f}'.format(float(pg_bests) * 100 / total)}%\n"
+ self.content += f"|Geometric mean QE default\n2+m|{'{:.2f}'.format(self.geo_mean(qe_default_geo))}\n"
+ self.content += f"|Geometric mean QE best\n2+m|{'{:.2f}'.format(self.geo_mean(qe_bests_geo))}\n"
+ self.content += f"|Geometric mean QO default vs best|{'{:.2f}'.format(self.geo_mean(qo_yb_bests_geo))}" \
+ f"|{'{:.2f}'.format(self.geo_mean(qo_pg_bests_geo))}\n"
+ self.content += f"|% Queries > 10x: YB default vs PG default\n" \
+ f"2+m|{slower_then_10x}/{total} (+{timed_out} timed out)\n"
+ self.content += f"|% Queries > 10x: YB best vs PG default\n2+m|{best_slower_then_10x}/{total}\n"
+ self.end_table()
+
+ self.content += "\n[#top]\n== QE score\n"
+
+ num_columns = 7
+ for tag, queries in self.queries.items():
+ self.start_table("1,1,1,1,1,1,4")
+ self.content += "|YB|YB Best|PG|PG Best|Ratio YB vs PG|Ratio Best YB vs PG|Query\n"
+ self.content += f"{num_columns}+m|{tag}.sql\n"
+ for query in queries:
+ yb_query = query[0]
+ pg_query = query[1]
+
+ yb_best = yb_query.get_best_optimization(self.config)
+ pg_best = pg_query.get_best_optimization(self.config)
+
+ pg_success = pg_query.execution_time_ms > 0
+
+ default_yb_equality = "[green]" if yb_query.compare_plans(yb_best) else "[red]"
+ default_pg_equality = "[green]" \
+ if pg_success and pg_query.compare_plans(pg_best) else "[red]"
+
+ best_yb_pg_equality = "(eq) " if yb_best.compare_plans(pg_best) else ""
+
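+                # the color threshold allows YB to be up to 3x slower than PG (green),
+                # while the displayed value is the raw YB/PG execution time ratio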
+ ratio_x3 = yb_query.execution_time_ms / (3 * pg_query.execution_time_ms) \
+ if yb_best.execution_time_ms > 0 and pg_success else 99999999
+ ratio_x3_str = "{:.2f}".format(yb_query.execution_time_ms / pg_query.execution_time_ms
+ if yb_best.execution_time_ms > 0 and pg_success else 99999999)
+ ratio_color = "[green]" if ratio_x3 <= 1.0 else "[red]"
+
+ ratio_best = yb_best.execution_time_ms / (3 * pg_best.execution_time_ms) \
+ if yb_best.execution_time_ms > 0 and pg_success else 99999999
+ ratio_best_x3_str = "{:.2f}".format(yb_best.execution_time_ms / pg_best.execution_time_ms
+ if yb_best.execution_time_ms > 0 and pg_success else 99999999)
+ ratio_best_color = "[green]" if ratio_best <= 1.0 else "[red]"
+
+ bitmap_flag = "[blue]" \
+ if pg_success and "bitmap" in pg_query.execution_plan.full_str.lower() else "[black]"
+
+ self.content += f"a|[black]#*{'{:.2f}'.format(yb_query.execution_time_ms)}*#\n" \
+ f"a|{default_yb_equality}#*{'{:.2f}'.format(yb_best.execution_time_ms)}*#\n" \
+ f"a|{bitmap_flag}#*{'{:.2f}'.format(pg_query.execution_time_ms)}*#\n" \
+ f"a|{default_pg_equality}#*{'{:.2f}'.format(pg_best.execution_time_ms)}*#\n" \
+ f"a|{ratio_color}#*{ratio_x3_str}*#\n" \
+ f"a|{ratio_best_color}#*{best_yb_pg_equality}{ratio_best_x3_str}*#\n"
+
+ self.content += f"a|[#{yb_query.query_hash}_top]"
+ self.append_tag_page_link(tag, yb_query.query_hash, f"Query {yb_query.query_hash}")
+
+ self.start_source(["sql"])
+ self.content += format_sql(pg_query.get_reportable_query())
+ self.end_source()
+ self.content += "\n"
+ self.end_table_row()
+
+ self.end_table()
+
+ # different results links
+ for tag in self.queries.keys():
+ self.append_tag_page_link(tag, None, f"{tag} queries file")
+
+ for tag, queries in self.queries.items():
+ sub_report = self.create_sub_report(tag)
+ sub_report.content += f"\n[#{tag}]\n== {tag} queries file\n\n"
+ for query in queries:
+ self.__report_query(sub_report, query[0], query[1], True)
+
+ def __report_near_queries(self, report, query: Type[Query]):
+ if query.optimizations:
+ best_optimization = query.get_best_optimization(self.config)
+ if add_to_report := "".join(
+ f"`{optimization.explain_hints}`\n\n"
+ for optimization in query.optimizations
+ if allowed_diff(self.config, best_optimization.execution_time_ms,
+ optimization.execution_time_ms)):
+ report.start_collapsible("Near best optimization hints")
+ report.content += add_to_report
+ report.end_collapsible()
+
+ def build_xls_report(self):
+ import xlsxwriter
+
+ workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_score.xls')
+ worksheet = workbook.add_worksheet()
+
+ head_format = workbook.add_format()
+ head_format.set_bold()
+ head_format.set_bg_color('#999999')
+
+ eq_format = workbook.add_format()
+ eq_format.set_bold()
+ eq_format.set_bg_color('#d9ead3')
+
+ eq_bad_format = workbook.add_format()
+ eq_bad_format.set_bold()
+ eq_bad_format.set_bg_color('#fff2cc')
+
+ eq_good_format = workbook.add_format()
+ eq_good_format.set_bold()
+ eq_good_format.set_bg_color('#d9ead3')
+
+ bm_format = workbook.add_format()
+ bm_format.set_bold()
+ bm_format.set_bg_color('#cfe2f3')
+
+ pg_comparison_format = workbook.add_format()
+ pg_comparison_format.set_bold()
+ pg_comparison_format.set_bg_color('#fce5cd')
+
+ # Start from the first cell. Rows and columns are zero indexed.
+ yb_bests = 0
+ pg_bests = 0
+ total = 0
+ for queries in self.queries.values():
+ for query in queries:
+ yb_query = query[0]
+ pg_query = query[1]
+
+                yb_best = yb_query.get_best_optimization(self.config)
+                pg_best = pg_query.get_best_optimization(self.config)
+
+ yb_bests += 1 if yb_query.compare_plans(yb_best) else 0
+ pg_bests += 1 if pg_query.compare_plans(pg_best) else 0
+
+ total += 1
+
+ worksheet.write(0, 0, "YB", head_format)
+ worksheet.write(0, 1, "YB Best", head_format)
+ worksheet.write(0, 2, "YB EQ", head_format)
+ worksheet.write(0, 3, "PG", head_format)
+ worksheet.write(0, 4, "PG Best", head_format)
+ worksheet.write(0, 5, "PG EQ", head_format)
+ worksheet.write(0, 6, "Ratio YB vs PG", head_format)
+ worksheet.write(0, 7, "Default EQ", head_format)
+ worksheet.write(0, 8, "Best YB vs PG", head_format)
+ worksheet.write(0, 9, "Best EQ", head_format)
+ worksheet.write(0, 10, "Query", head_format)
+ worksheet.write(0, 11, "Query Hash", head_format)
+ worksheet.write(0, 12, "YB Plan", head_format)
+ worksheet.write(0, 13, "YB Best Plan", head_format)
+ worksheet.write(0, 14, "PG Plan", head_format)
+ worksheet.write(0, 15, "PG Best Plan", head_format)
+
+ row = 1
+ # Iterate over the data and write it out row by row.
+ for tag, queries in self.queries.items():
+ for query in queries:
+ yb_query: PostgresQuery = query[0]
+ pg_query: PostgresQuery = query[1]
+
+                yb_best = yb_query.get_best_optimization(self.config)
+                pg_best = pg_query.get_best_optimization(self.config)
+
+ default_yb_equality = yb_query.compare_plans(yb_best)
+ default_pg_equality = pg_query.compare_plans(pg_best)
+
+ default_yb_pg_equality = yb_query.compare_plans(pg_query)
+ best_yb_pg_equality = yb_best.compare_plans(pg_best)
+
+ ratio_x3 = yb_query.execution_time_ms / (3 * pg_query.execution_time_ms) \
+ if pg_query.execution_time_ms > 0 else 99999999
+ ratio_x3_str = "{:.2f}".format(yb_query.execution_time_ms / pg_query.execution_time_ms
+ if pg_query.execution_time_ms > 0 else 99999999)
+ ratio_color = ratio_x3 > 1.0
+
+ ratio_best = yb_best.execution_time_ms / (3 * pg_best.execution_time_ms) \
+ if yb_best.execution_time_ms > 0 and pg_best.execution_time_ms > 0 else 99999999
+ ratio_best_x3_str = "{:.2f}".format(
+ yb_best.execution_time_ms / pg_best.execution_time_ms
+ if yb_best.execution_time_ms > 0 and pg_best.execution_time_ms > 0 else 99999999)
+ ratio_best_color = ratio_best > 1.0
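+                # a True color flag means YB exceeded 3x of the PG time; it selects the highlight format below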
+
+ bitmap_flag = pg_query.execution_plan and "bitmap" in pg_query.execution_plan.full_str.lower()
+
+ best_pg_format = None
+ if ratio_best_color and best_yb_pg_equality:
+ best_pg_format = eq_bad_format
+ elif best_yb_pg_equality:
+ best_pg_format = eq_good_format
+ elif ratio_best_color:
+ best_pg_format = pg_comparison_format
+
+ df_pf_format = None
+ if ratio_color and default_yb_pg_equality:
+ df_pf_format = eq_bad_format
+ elif default_yb_pg_equality:
+ df_pf_format = eq_good_format
+ elif ratio_color:
+ df_pf_format = pg_comparison_format
+
+ worksheet.write(row, 0, '{:.2f}'.format(yb_query.execution_time_ms))
+ worksheet.write(row, 1,
+ f"{'{:.2f}'.format(yb_best.execution_time_ms)}",
+ eq_format if default_yb_equality else None)
+ worksheet.write(row, 2, default_yb_equality)
+ worksheet.write(row, 3,
+ f"{'{:.2f}'.format(pg_query.execution_time_ms)}",
+ bm_format if bitmap_flag else None)
+ worksheet.write(row, 4,
+ f"{'{:.2f}'.format(pg_best.execution_time_ms)}",
+ eq_format if default_pg_equality else None)
+ worksheet.write(row, 5, default_pg_equality)
+ worksheet.write(row, 6, f"{ratio_x3_str}", df_pf_format)
+ worksheet.write(row, 7, default_yb_pg_equality)
+ worksheet.write(row, 8, f"{ratio_best_x3_str}", best_pg_format)
+ worksheet.write(row, 9, best_yb_pg_equality)
+ worksheet.write(row, 10, f'{format_sql(pg_query.query)}')
+ worksheet.write(row, 11, f'{pg_query.query_hash}')
+ worksheet.write(row, 12, f'{yb_query.execution_plan}')
+ worksheet.write(row, 13,
+ '<---' if default_yb_equality else f'{yb_best.execution_plan}')
+ worksheet.write(row, 14, f'{pg_query.execution_plan}')
+ worksheet.write(row, 15,
+ '<---' if default_pg_equality else f'{pg_best.execution_plan}')
+ row += 1
+
+ workbook.close()
+
+ def __report_heatmap(self, report, query: Type[Query]):
+ """
+        In PG plans, each plan tree node can be separated by splitting on `->`.
+        When constructing the heatmap, a `+` or `-` has to be added at the beginning of every
+        line, so there are two separators to handle - `\n` and `->` - and the result has to be
+        reassembled correctly.
+
+        :param report: report to append the heatmap to
+        :param query: query whose execution plan heatmap is rendered
+        :return:
+ """
+ # TODO FIX THIS!!!!!
+ if not (execution_plan_heatmap := query.heatmap()):
+ return
+
+ best_decision = max(row['weight'] for row in execution_plan_heatmap.values())
+ last_rowid = max(execution_plan_heatmap.keys())
+ result = ""
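+        # rows with the highest weight are prefixed with '+', rows with zero weight with '-',
+        # so the collapsible below renders as a diff-style heatmap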
+ for row_id, row in execution_plan_heatmap.items():
+ rows = row['str'].split("\n")
+
+ if row['weight'] == best_decision:
+ result = self.fix_last_newline_in_result(result, rows)
+ result += "\n".join([f"+{line}" for line_id, line in enumerate(rows)
+ if line_id != (len(rows) - 1)]) + f"\n{rows[-1]}"
+ elif row['weight'] == 0:
+ result = self.fix_last_newline_in_result(result, rows)
+ result += "\n".join([f"-{line}" for line_id, line in enumerate(rows)
+ if line_id != (len(rows) - 1)]) + f"\n{rows[-1]}"
+ else:
+ result += f"{row['str']}"
+
+ # skip adding extra -> to the end of list
+ if row_id != last_rowid:
+ result += "->"
+
+ report.start_collapsible("Plan heatmap")
+ report.start_source(["diff"])
+ report.content += result
+ report.end_source()
+ report.end_collapsible()
+
+ @staticmethod
+ def fix_last_newline_in_result(result, rows):
+ if result:
+ result, last_newline = result.rsplit("\n", 1)
+ rows[0] = f"{last_newline}{rows[0]}"
+ result += "\n"
+
+ return result
+
+ # noinspection InsecureHash
+ def __report_query(self, report, yb_query: Type[Query], pg_query: Type[Query], show_best: bool):
+ yb_best = yb_query.get_best_optimization(self.config)
+ inconsistencies = yb_query.get_inconsistent_results()
+
+ self.reported_queries_counter += 1
+
+ report.content += f"\n[#{yb_query.query_hash}]\n"
+ report.content += f"=== Query {yb_query.query_hash}"
+ report.content += f"\n{yb_query.tag}\n"
+ report.append_index_page_hashtag_link("top", "Go to index")
+ report.append_index_page_hashtag_link(f"{yb_query.query_hash}_top", "Show in summary")
+ report.add_double_newline()
+
+ report.start_source(["sql"])
+ report.content += format_sql(yb_query.get_reportable_query())
+ report.end_source()
+
+ analyze_execution_time = extract_execution_time_from_analyze(yb_query.execution_plan.full_str)
+ avg_execution_time = yb_query.execution_time_ms
+
+ if (analyze_execution_time > avg_execution_time and
+ not allowed_diff(self.config, avg_execution_time, analyze_execution_time)):
+ report.add_double_newline()
+            report.content += f"WARN! Analyze time is greater than the average - `{analyze_execution_time}` > `{avg_execution_time}`"
+ report.add_double_newline()
+
+ if inconsistencies:
+ report.add_double_newline()
+ report.content += f"ERROR! YB Inconsistent hints - `{inconsistencies}`"
+ report.add_double_newline()
+
+ # TODO disabled by dev request since it is not accurate sometimes
+ # report.add_double_newline()
+ # report.content += f"YB Default explain hints - `{yb_query.explain_hints}`"
+ # report.add_double_newline()
+ #
+ # if show_best:
+ # report.add_double_newline()
+ # report.content += f"YB Best explain hints - `{yb_best.explain_hints}`"
+ # report.add_double_newline()
+ #
+ # self.__report_near_queries(report, yb_query)
+
+ report.start_table("2")
+ report.content += "|Default|Log scale\n"
+ query_plot = self.create_query_plot(yb_best, yb_query.optimizations, yb_query)
+ query_plot_log = self.create_query_plot(yb_best, yb_query.optimizations, yb_query, "log")
+ report.content += f"a|image::../{query_plot}[Default,align=\"center\"]\n"
+ report.content += f"a|image::../{query_plot_log}[Log scale,align=\"center\"]\n"
+ report.end_table()
+
+ report.add_double_newline()
+
+ report.add_double_newline()
+ default_yb_equality = "(eq) " if yb_query.compare_plans(yb_best) else ""
+ default_pg_equality = ""
+ default_yb_pg_equality = ""
+
+ best_yb_pg_equality = ""
+ if pg_query and pg_query.execution_time_ms > 0:
+ report.start_table("5")
+ report.content += "|Metric|YB|YB Best|PG|PG Best\n"
+
+ pg_best = pg_query.get_best_optimization(self.config)
+ default_pg_equality = "(eq) " if pg_query.compare_plans(pg_best) else ""
+ best_yb_pg_equality = "(eq) " if yb_best.compare_plans(pg_best) else ""
+ default_yb_pg_equality = "(eq) " if yb_query.compare_plans(pg_query) else ""
+
+ if 'order by' in yb_query.query:
+ report.start_table_row()
+ report.content += f"!! Result hash" \
+ f"|{yb_query.result_hash}" \
+ f"|{yb_best.result_hash}" \
+ f"|{pg_query.result_hash}" \
+ f"|{pg_best.result_hash}" \
+ if pg_query.result_hash != yb_query.result_hash else \
+ f"Result hash" \
+                                  f"|{yb_query.result_hash}" \
+ f"|{yb_best.result_hash}" \
+ f"|{pg_query.result_hash}" \
+ f"|{pg_best.result_hash}"
+ report.end_table_row()
+
+ report.start_table_row()
+ report.content += f"Cardinality" \
+ f"|{yb_query.result_cardinality}" \
+ f"|{yb_best.result_cardinality}" \
+ f"|{pg_query.result_cardinality}" \
+ f"|{pg_best.result_cardinality}"
+ report.end_table_row()
+ report.start_table_row()
+ report.content += f"Estimated cost" \
+ f"|{yb_query.execution_plan.get_estimated_cost()}" \
+ f"|{default_yb_equality}{yb_best.execution_plan.get_estimated_cost()}" \
+ f"|{pg_query.execution_plan.get_estimated_cost()}" \
+ f"|{default_pg_equality}{pg_best.execution_plan.get_estimated_cost()}"
+ report.end_table_row()
+ report.start_table_row()
+ report.content += f"Execution time" \
+ f"|{'{:.2f}'.format(yb_query.execution_time_ms)}" \
+ f"|{default_yb_equality}{'{:.2f}'.format(yb_best.execution_time_ms)}" \
+ f"|{'{:.2f}'.format(pg_query.execution_time_ms)}" \
+ f"|{default_pg_equality}{'{:.2f}'.format(pg_best.execution_time_ms)}"
+ report.end_table_row()
+ # report.start_table_row()
+ # report.content += f"Server execution time" \
+ # f"|{'{:.2f}'.format(extract_execution_time_from_analyze(yb_query.execution_plan.full_str))}" \
+ # f"|{'{:.2f}'.format(extract_execution_time_from_analyze(yb_best.execution_plan.full_str))}" \
+ # f"|{'{:.2f}'.format(extract_execution_time_from_analyze(pg_query.execution_plan.full_str))}" \
+ # f"|{'{:.2f}'.format(extract_execution_time_from_analyze(pg_best.execution_plan.full_str))}"
+ # report.end_table_row()
+ # report.start_table_row()
+ # report.content += f"Result collect time" \
+ # f"|{'{:.2f}'.format(calculate_client_execution_time(yb_query))}" \
+ # f"|{'{:.2f}'.format(calculate_client_execution_time(yb_best))}" \
+ # f"|{'{:.2f}'.format(calculate_client_execution_time(pg_query))}" \
+ # f"|{'{:.2f}'.format(calculate_client_execution_time(pg_best))}"
+ # report.end_table_row()
+ else:
+ report.start_table("3")
+ report.content += "|Metric|YB|YB Best\n"
+
+ if yb_best.result_hash != yb_query.result_hash:
+ report.content += f"!! Result hash|{yb_query.result_hash}|{yb_best.result_hash}"
+ else:
+ report.content += f"Result hash|{yb_query.result_hash}|{yb_best.result_hash}"
+ report.end_table_row()
+
+ report.start_table_row()
+ report.content += f"Cardinality" \
+ f"|{yb_query.result_cardinality}" \
+ f"|{yb_best.result_cardinality}"
+ report.end_table_row()
+ report.start_table_row()
+ report.content += f"Optimizer cost" \
+ f"|{yb_query.execution_plan.get_estimated_cost()}" \
+ f"|{default_yb_equality}{yb_best.execution_plan.get_estimated_cost()}"
+ report.end_table_row()
+ report.start_table_row()
+ report.content += f"Execution time" \
+ f"|{yb_query.execution_time_ms}" \
+ f"|{default_yb_equality}{yb_best.execution_time_ms}"
+ report.end_table_row()
+ report.end_table()
+
+ report.start_table()
+ report.start_table_row()
+
+ if yb_query.query_stats:
+ report.start_collapsible("YB stats default")
+ report.start_source()
+ report.content += str(yb_query.query_stats)
+ report.end_source()
+ report.end_collapsible()
+
+ if yb_best.query_stats and not yb_query.compare_plans(yb_best):
+ report.start_collapsible("YB stats best")
+ report.start_source()
+ report.content += str(yb_best.query_stats)
+ report.end_source()
+ report.end_collapsible()
+
+ if pg_query and pg_query.execution_time_ms > 0:
+ bitmap_used = "(bm) " if "bitmap" in pg_query.execution_plan.full_str.lower() else ""
+ report.start_collapsible(f"{bitmap_used}PG plan")
+ report.start_source(["diff"])
+ report.content += pg_query.execution_plan.full_str
+ report.end_source()
+ report.end_collapsible()
+
+ pg_best = pg_query.get_best_optimization(self.config)
+ bitmap_used = "(bm) " if "bitmap" in pg_best.execution_plan.full_str.lower() else ""
+ report.start_collapsible(f"{default_pg_equality}{bitmap_used}PG best")
+ report.start_source(["diff"])
+ report.content += pg_best.execution_plan.full_str
+ report.end_source()
+ report.end_collapsible()
+
+ report.start_collapsible(f"{default_yb_pg_equality}PG default vs YB default")
+ report.start_source(["diff"])
+
+ # postgres plan should be red
+ report.content += get_plan_diff(
+ pg_query.execution_plan.full_str.replace("|", "\|"),
+ yb_query.execution_plan.full_str.replace("|", "\|"),
+ )
+ report.end_source()
+ report.end_collapsible()
+
+ report.start_collapsible(f"{best_yb_pg_equality}PG best vs YB best")
+ report.start_source(["diff"])
+ report.content += get_plan_diff(
+ pg_best.execution_plan.full_str.replace("|", "\|"),
+ yb_best.execution_plan.full_str.replace("|", "\|"),
+ )
+ report.end_source()
+ report.end_collapsible()
+
+ if show_best:
+ pass
+ # self.__report_heatmap(report, yb_query)
+
+ report.start_collapsible("YB default plan")
+ report.start_source(["diff"])
+ report.content += yb_query.execution_plan.full_str.replace("|", "\|")
+ report.end_source()
+ report.end_collapsible()
+
+ report.start_collapsible(f"{default_yb_equality}YB best plan")
+ report.start_source(["diff"])
+ report.content += yb_best.execution_plan.full_str.replace("|", "\|")
+ report.end_source()
+ report.end_collapsible()
+
+ report.content += f"{default_yb_equality}YB default vs YB best\n"
+ report.start_source(["diff"])
+ diff = get_plan_diff(
+ yb_query.execution_plan.full_str.replace("|", "\|"),
+ yb_best.execution_plan.full_str.replace("|", "\|")
+ )
+ if not diff:
+ diff = yb_query.execution_plan.full_str.replace("|", "\|")
+
+ report.content += diff
+ report.end_source()
+ report.end_table_row()
+
+ report.content += "\n"
+
+ report.end_table()
+
+ report.add_double_newline()
diff --git a/src/actions/reports/score_stats.py b/src/actions/reports/score_stats.py
new file mode 100644
index 00000000..a8e2f54f
--- /dev/null
+++ b/src/actions/reports/score_stats.py
@@ -0,0 +1,103 @@
+import json
+from typing import Type
+
+from actions.report import AbstractReportAction
+from collect import CollectResult
+from objects import Query
+
+
+class ScoreStatsReport(AbstractReportAction):
+
+ def __init__(self):
+ super().__init__(False)
+
+ self.queries = {}
+ self.json = {}
+
+ @classmethod
+ def generate_report(cls, loq: CollectResult, pg_loq: CollectResult = None):
+ report = ScoreStatsReport()
+
+ for query in loq.queries:
+ report.add_query(query, pg_loq.find_query_by_hash(query.query_hash) if pg_loq else None)
+
+ report.build_report(loq)
+ report.dump_json()
+
+ def add_query(self, query: Type[Query], pg: Type[Query] | None):
+ if query.tag not in self.queries:
+ self.queries[query.tag] = [[query, pg], ]
+ else:
+ self.queries[query.tag].append([query, pg])
+
+ def build_report(self, loq):
+ yb_bests = 0
+ pg_bests = 0
+ qe_default_geo = []
+ qe_bests_geo = []
+ qo_yb_bests_geo = []
+ qo_pg_bests_geo = []
+ timed_out = 0
+ slower_then_10x = 0
+ best_slower_then_10x = 0
+ inconsistent_results = 0
+ total = 0
+
+ try:
+ for queries in self.queries.values():
+ for query in queries:
+ try:
+ yb_query = query[0]
+ pg_query = query[1]
+
+ yb_best = yb_query.get_best_optimization(self.config)
+ pg_best = pg_query.get_best_optimization(self.config)
+
+ inconsistent_results += 1 if yb_query.get_inconsistent_results() else 0
+
+ pg_success = pg_query.execution_time_ms > 0
+ yb_success = yb_query.execution_time_ms > 0
+
+ qe_default_geo.append(yb_query.execution_time_ms / pg_query.execution_time_ms
+ if pg_success and yb_success else 1)
+ qe_bests_geo.append(yb_best.execution_time_ms / pg_best.execution_time_ms
+ if pg_success and yb_success else 1)
+
+ if yb_query.execution_time_ms > 0 and yb_best.execution_time_ms > 0:
+ qo_yb_bests_geo.append(yb_query.execution_time_ms / yb_best.execution_time_ms)
+ if pg_query.execution_time_ms > 0 and pg_best.execution_time_ms > 0:
+ qo_pg_bests_geo.append(pg_query.execution_time_ms / pg_best.execution_time_ms)
+
+ yb_bests += 1 if yb_query.compare_plans(yb_best) else 0
+ pg_bests += 1 if pg_success and pg_query.compare_plans(pg_best) else 0
+ timed_out += 1 if yb_query.execution_time_ms == -1 else 0
+ slower_then_10x += 1 if pg_query.execution_time_ms and \
+ (yb_query.execution_time_ms / pg_query.execution_time_ms) >= 10 else 0
+ best_slower_then_10x += 1 if pg_query.execution_time_ms and \
+ (yb_best.execution_time_ms / pg_query.execution_time_ms) >= 10 else 0
+ except Exception as e:
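+                    # ignore per-query failures so one broken query does not abort stats collection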
+ pass
+
+ total += 1
+ except Exception as e:
+ self.logger.exception(e)
+
+ self.json = {
+ "best_picked": '{:.2f}'.format(float(yb_bests) * 100 / total),
+ "qe_default": '{:.2f}'.format(self.geo_mean(qe_default_geo)),
+ "qe_best": '{:.2f}'.format(self.geo_mean(qe_bests_geo)),
+ "qo_default_vs_best": '{:.2f}'.format(self.geo_mean(qo_yb_bests_geo)),
+
+ "total": total,
+ "timeout": timed_out,
+ "more_10x_default_vs_default": slower_then_10x,
+ "more_10x_best_vs_default": best_slower_then_10x,
+
+ "version": loq.db_version,
+ "commit": loq.git_message,
+ "ddl_time": loq.ddl_execution_time,
+ "model_time": loq.model_execution_time,
+ }
+
+ def dump_json(self):
+ self.logger.info(f"Result: {json.dumps(self.json)}")
diff --git a/src/actions/reports/selectivity.py b/src/actions/reports/selectivity.py
new file mode 100644
index 00000000..43a2d48d
--- /dev/null
+++ b/src/actions/reports/selectivity.py
@@ -0,0 +1,196 @@
+from sql_formatter.core import format_sql
+
+from collect import CollectResult
+from objects import Query
+from actions.report import AbstractReportAction
+from utils import allowed_diff, get_plan_diff
+
+
+class SelectivityReport(AbstractReportAction):
+ def __init__(self):
+ super().__init__()
+
+ self.different_explain_plans = []
+ self.same_execution_plan = []
+ self.almost_same_execution_time = []
+ self.improved_execution_time = []
+ self.worse_execution_time = []
+
+ def get_report_name(self):
+ return "Default/Analyze/Analyze+Statistics"
+
+ @classmethod
+ def generate_report(cls,
+ loq_default: CollectResult,
+ loq_default_analyze: CollectResult,
+ loq_ta: CollectResult,
+ loq_ta_analyze: CollectResult,
+ loq_stats: CollectResult,
+ loq_stats_analyze: CollectResult):
+ report = SelectivityReport()
+
+ report.report_model(loq_default.model_queries)
+
+ for query in zip(loq_default.queries,
+ loq_default_analyze.queries,
+ loq_ta.queries,
+ loq_ta_analyze.queries,
+ loq_stats.queries,
+ loq_stats_analyze.queries):
+ report.add_query(*query)
+
+ report.build_report()
+ report.publish_report("sltvty")
+
+ def add_query(self,
+ default: Query,
+ default_analyze: Query,
+ ta: Query,
+ ta_analyze: Query,
+ stats: Query,
+ stats_analyze: Query
+ ):
+ queries_tuple = [default, default_analyze, ta, ta_analyze, stats, stats_analyze]
+ if not default.compare_plans(default_analyze) or \
+ not ta.compare_plans(ta_analyze) or \
+ not stats.compare_plans(stats_analyze):
+ self.different_explain_plans.append(queries_tuple)
+
+ if default.compare_plans(stats_analyze):
+ self.same_execution_plan.append(queries_tuple)
+ elif allowed_diff(self.config, default.execution_time_ms, stats_analyze.execution_time_ms):
+ self.almost_same_execution_time.append(queries_tuple)
+ elif default.execution_time_ms < stats_analyze.execution_time_ms:
+ self.worse_execution_time.append(queries_tuple)
+ else:
+ self.improved_execution_time.append(queries_tuple)
+
+ def build_report(self):
+ # link to top
+ self.content += "\n[#top]\n== All results by analysis type\n"
+ # different results links
+        self.content += "\n<<error>>\n"
+        self.content += "\n<<worse>>\n"
+        self.content += "\n<<same_time>>\n"
+        self.content += "\n<<improved>>\n"
+        self.content += "\n<<same_plan>>\n"
+
+ self.content += f"\n[#error]\n== ERROR: Different EXPLAIN and EXPLAIN ANALYZE plans ({len(self.different_explain_plans)})\n\n"
+ for query in self.different_explain_plans:
+ self.__report_query(*query)
+
+ self.content += f"\n[#worse]\n== Worse execution time queries ({len(self.worse_execution_time)})\n\n"
+ for query in self.worse_execution_time:
+ self.__report_query(*query)
+
+ self.content += f"\n[#same_time]\n== Almost same execution time queries ({len(self.almost_same_execution_time)})\n\n"
+ for query in self.almost_same_execution_time:
+ self.__report_query(*query)
+
+ self.content += f"\n[#improved]\n== Improved execution time ({len(self.improved_execution_time)})\n\n"
+ for query in self.improved_execution_time:
+ self.__report_query(*query)
+
+ self.content += f"\n[#same_plan]\n\n== Same execution plan ({len(self.same_execution_plan)})\n\n"
+ for query in self.same_execution_plan:
+ self.__report_query(*query)
+
+ # noinspection InsecureHash
+ def __report_query(self,
+ default: Query,
+ default_analyze: Query,
+ analyze: Query,
+ analyze_analyze: Query,
+ all: Query,
+ all_analyze: Query):
+ self.reported_queries_counter += 1
+
+ self.content += f"=== Query {default.query_hash}"
+ self.content += f"\n{default.tag}\n"
+        self.content += "\n<<top,Go to top>>\n"
+ self.add_double_newline()
+
+ self.start_source(["sql"])
+ self.content += format_sql(default.query.replace("|", "\|"))
+ self.end_source()
+
+ self.add_double_newline()
+
+ self.start_table("7")
+ self.content += "|Metric|Default|Default+QA|TA|TA + QA|S+TA|S+TA+QA\n"
+ self.start_table_row()
+ self.content += f"Cardinality|{default.result_cardinality}|{default_analyze.result_cardinality}|" \
+ f"{analyze.result_cardinality}|{analyze_analyze.result_cardinality}|" \
+ f"{all.result_cardinality}|{all_analyze.result_cardinality}"
+ self.end_table_row()
+ self.start_table_row()
+ self.content += f"Optimizer cost|{default.execution_plan.get_estimated_cost()}|{default_analyze.execution_plan.get_estimated_cost()}|" \
+ f"{analyze.execution_plan.get_estimated_cost()}|{analyze_analyze.execution_plan.get_estimated_cost()}|" \
+ f"{all.execution_plan.get_estimated_cost()}|{all_analyze.execution_plan.get_estimated_cost()}"
+ self.end_table_row()
+ self.start_table_row()
+ self.content += f"Execution time|{default.execution_time_ms}|{default_analyze.execution_time_ms}|" \
+ f"{analyze.execution_time_ms}|{analyze_analyze.execution_time_ms}|" \
+ f"{all.execution_time_ms}|{all_analyze.execution_time_ms}"
+ self.end_table_row()
+ self.end_table()
+
+ self.start_table()
+
+ self.start_table_row()
+
+ self.start_collapsible("Default approach plan (w/o analyze)")
+ self.start_source(["diff"])
+ self.content += default.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible("Default approach plan with EXPLAIN ANALYZE (w/o analyze)")
+ self.start_source(["diff"])
+ self.content += default_analyze.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible("Plan with analyzed table (w/ analyze)")
+ self.start_source(["diff"])
+ self.content += analyze.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible("Plan with analyzed table with EXPLAIN ANALYZE (w/ analyze)")
+ self.start_source(["diff"])
+ self.content += analyze_analyze.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible("Stats + table analyze (w/ analyze and statistics)")
+ self.start_source(["diff"])
+ self.content += all.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible(
+ "Stats + table analyze with EXPLAIN ANALYZE (w/ analyze and statistics)")
+ self.start_source(["diff"])
+ self.content += all_analyze.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_source(["diff"])
+
+ diff = get_plan_diff(
+ default.execution_plan.full_str,
+ all_analyze.execution_plan.full_str
+ )
+ if not diff:
+ diff = default.execution_plan.full_str
+
+ self.content += diff
+ self.end_source()
+ self.end_table_row()
+
+ self.content += "\n"
+
+ self.end_table()
+
+ self.add_double_newline()
diff --git a/src/reports/adoc/taqo.py b/src/actions/reports/taqo.py
similarity index 53%
rename from src/reports/adoc/taqo.py
rename to src/actions/reports/taqo.py
index e5e1ef87..e1deeca6 100644
--- a/src/reports/adoc/taqo.py
+++ b/src/actions/reports/taqo.py
@@ -1,20 +1,18 @@
-import os
from typing import Type
from matplotlib import pyplot as plt
from sql_formatter.core import format_sql
-from objects import ListOfQueries, Query
-from reports.abstract import Report
-from utils import allowed_diff
+from collect import CollectResult
+from objects import Query
+from actions.report import AbstractReportAction
+from utils import allowed_diff, get_plan_diff
-class TaqoReport(Report):
+class TaqoReport(AbstractReportAction):
def __init__(self):
super().__init__()
- os.mkdir(f"report/{self.start_date}/imgs")
-
self.logger.info(f"Created report folder for this run at 'report/{self.start_date}'")
self.failed_validation = []
@@ -22,10 +20,13 @@ def __init__(self):
self.better_plan_found = []
@classmethod
- def generate_report(cls, loq: ListOfQueries, pg_loq: ListOfQueries = None):
+ def generate_report(cls, loq: CollectResult, pg_loq: CollectResult = None):
report = TaqoReport()
report.define_version(loq.db_version)
+ report.report_config(loq.config, "YB")
+ if pg_loq:
+ report.report_config(pg_loq.config, "PG")
report.report_model(loq.model_queries)
for qid, query in enumerate(loq.queries):
@@ -38,7 +39,7 @@ def get_report_name(self):
return "TAQO"
def define_version(self, version):
- self.report += f"[VERSION]\n====\n{version}\n====\n\n"
+ self.content += f"[VERSION]\n====\n{version}\n====\n\n"
def calculate_score(self, query):
if query.execution_time_ms == 0:
@@ -51,8 +52,8 @@ def create_plot(self, best_optimization, optimizations, query):
plt.xlabel('Execution time')
plt.ylabel('Optimizer cost')
- plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms != 0],
- [q.execution_plan.get_estimated_cost() for q in optimizations if q.execution_time_ms != 0], 'k.',
+ plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms > 0],
+ [q.execution_plan.get_estimated_cost() for q in optimizations if q.execution_time_ms > 0], 'k.',
[query.execution_time_ms],
[query.execution_plan.get_estimated_cost()], 'r^',
[best_optimization.execution_time_ms],
@@ -83,21 +84,21 @@ def add_query(self, query: Type[Query], pg: Type[Query] | None):
def build_report(self):
# link to top
- self.report += "\n[#top]\n== All results by analysis type\n"
+ self.content += "\n[#top]\n== All results by analysis type\n"
# different results links
-        self.report += "\n<<result>>\n"
-        self.report += "\n<<better>>\n"
-        self.report += "\n<<found>>\n"
+        self.content += "\n<<result>>\n"
+        self.content += "\n<<better>>\n"
+        self.content += "\n<<found>>\n"
- self.report += f"\n[#result]\n== Result validation failure ({len(self.failed_validation)})\n\n"
+ self.content += f"\n[#result]\n== Result validation failure ({len(self.failed_validation)})\n\n"
for query in self.failed_validation:
self.__report_query(query[0], query[1], True)
- self.report += f"\n[#better]\n== Better plan found queries ({len(self.better_plan_found)})\n\n"
+ self.content += f"\n[#better]\n== Better plan found queries ({len(self.better_plan_found)})\n\n"
for query in self.better_plan_found:
self.__report_query(query[0], query[1], True)
- self.report += f"\n[#found]\n== No better plan found ({len(self.same_execution_plan)})\n\n"
+ self.content += f"\n[#found]\n== No better plan found ({len(self.same_execution_plan)})\n\n"
for query in self.same_execution_plan:
self.__report_query(query[0], query[1], False)
@@ -108,9 +109,9 @@ def __report_near_queries(self, query: Query):
for optimization in query.optimizations
if allowed_diff(self.config, best_optimization.execution_time_ms,
optimization.execution_time_ms)):
- self._start_collapsible("All best optimization hints")
- self.report += add_to_report
- self._end_collapsible()
+ self.start_collapsible("All best optimization hints")
+ self.content += add_to_report
+ self.end_collapsible()
def __report_heatmap(self, query: Query):
"""
@@ -143,11 +144,11 @@ def __report_heatmap(self, query: Query):
if row_id != last_rowid:
result += "->"
- self._start_collapsible("Plan heatmap")
- self._start_source(["diff"])
- self.report += result
- self._end_source()
- self._end_collapsible()
+ self.start_collapsible("Plan heatmap")
+ self.start_source(["diff"])
+ self.content += result
+ self.end_source()
+ self.end_collapsible()
@staticmethod
def fix_last_newline_in_result(result, rows):
@@ -165,125 +166,131 @@ def __report_query(self, query: Query, pg_query: Query, show_best: bool):
self.reported_queries_counter += 1
- self.report += f"=== Query {query.query_hash} " \
- f"(Optimizer efficiency - {self.calculate_score(query)})"
-        self.report += "\n<<top,Go to top>>\n"
- self._add_double_newline()
+ self.content += f"=== Query {query.query_hash} " \
+ f"(Optimizer efficiency - {self.calculate_score(query)})"
+        self.content += "\n<<top,Go to top>>\n"
+ self.add_double_newline()
- self._start_source(["sql"])
- self.report += format_sql(query.query.replace("|", "\|"))
- self._end_source()
+ self.start_source(["sql"])
+ self.content += format_sql(query.query.replace("|", "\|"))
+ self.end_source()
- self._add_double_newline()
- self.report += f"Default explain hints - `{query.explain_hints}`"
- self._add_double_newline()
+ self.add_double_newline()
+ self.content += f"Default explain hints - `{query.explain_hints}`"
+ self.add_double_newline()
if show_best:
- self._add_double_newline()
- self.report += f"Better explain hints - `{best_optimization.explain_hints}`"
- self._add_double_newline()
+ self.add_double_newline()
+ self.content += f"Better explain hints - `{best_optimization.explain_hints}`"
+ self.add_double_newline()
self.__report_near_queries(query)
filename = self.create_plot(best_optimization, query.optimizations, query)
- self.report += f"image::{filename}[\"Query {self.reported_queries_counter}\"]"
+ self.content += f"image::{filename}[\"Query {self.reported_queries_counter}\"]"
- self._add_double_newline()
+ self.add_double_newline()
- self._start_table("3")
- self.report += "|Metric|Default|Best\n"
+ self.start_table("3")
+ self.content += "|Metric|Default|Best\n"
if 'order by' in query.query:
- self._start_table_row()
+ self.start_table_row()
if self.config.compare_with_pg:
- self.report += \
+ self.content += \
f"!! Result hash|{query.result_hash}|{best_optimization.result_hash} (yb) != {pg_query.result_hash} (pg)" \
- if pg_query.result_hash != query.result_hash else \
- f"Result hash|`{query.result_hash}|{best_optimization.result_hash} (yb) != {pg_query.result_hash} (pg)"
+ if pg_query.result_hash != query.result_hash else \
+ f"Result hash|`{query.result_hash}|{best_optimization.result_hash} (yb) != {pg_query.result_hash} (pg)"
elif best_optimization.result_hash != query.result_hash:
- self.report += f"!! Result hash|{query.result_hash}|{best_optimization.result_hash}"
+ self.content += f"!! Result hash|{query.result_hash}|{best_optimization.result_hash}"
else:
- self.report += f"Result hash|{query.result_hash}|{best_optimization.result_hash}"
- self._end_table_row()
-
- self._start_table_row()
- self.report += f"Cardinality|{query.result_cardinality}|{best_optimization.result_cardinality}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Optimizer cost|{query.execution_plan.get_estimated_cost()}|{best_optimization.execution_plan.get_estimated_cost()}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Execution time|{query.execution_time_ms}|{best_optimization.execution_time_ms}"
- self._end_table_row()
- self._end_table()
-
- self._start_table()
- self._start_table_row()
+ self.content += f"Result hash|{query.result_hash}|{best_optimization.result_hash}"
+ self.end_table_row()
+
+ self.start_table_row()
+ self.content += f"Cardinality|{query.result_cardinality}|{best_optimization.result_cardinality}"
+ self.end_table_row()
+ self.start_table_row()
+ self.content += f"Optimizer cost|{query.execution_plan.get_estimated_cost()}|{best_optimization.execution_plan.get_estimated_cost()}"
+ self.end_table_row()
+ self.start_table_row()
+ self.content += f"Execution time|{query.execution_time_ms}|{best_optimization.execution_time_ms}"
+ self.end_table_row()
+ self.end_table()
+
+ self.start_table()
+ self.start_table_row()
if pg_query:
bitmap_used = "!!! bitmap !!!" if "bitmap" in pg_query.execution_plan.full_str.lower() else ""
- self._start_collapsible(f"Postgres plan {bitmap_used}")
- self._start_source(["diff"])
- self.report += pg_query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Postgres plan diff")
- self._start_source(["diff"])
+ self.start_collapsible(f"Postgres plan {bitmap_used}")
+ self.start_source(["diff"])
+ self.content += pg_query.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible("Postgres plan diff")
+ self.start_source(["diff"])
# postgres plan should be red
- self.report += self._get_plan_diff(pg_query.execution_plan.full_str,
- query.execution_plan.full_str, )
- self._end_source()
- self._end_collapsible()
+ self.content += get_plan_diff(pg_query.execution_plan.full_str,
+ query.execution_plan.full_str, )
+ self.end_source()
+ self.end_collapsible()
best_pg = pg_query.get_best_optimization(self.config)
- self._start_collapsible("Best Postgres plan")
- self._start_source(["diff"])
- self.report += best_pg.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Best Postgres plan diff with YB default")
- self._start_source(["diff"])
- self.report += self._get_plan_diff(best_pg.execution_plan.full_str,
- query.execution_plan.full_str, )
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Best Postgres plan diff with YB best")
- self._start_source(["diff"])
- self.report += self._get_plan_diff(best_pg.execution_plan.full_str,
- best_optimization.execution_plan.full_str, )
- self._end_source()
- self._end_collapsible()
+ self.start_collapsible("Best Postgres plan")
+ self.start_source(["diff"])
+ self.content += best_pg.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible("Best Postgres plan diff with YB default")
+ self.start_source(["diff"])
+ self.content += get_plan_diff(
+ query.execution_plan.full_str,
+ best_pg.execution_plan.full_str,
+ )
+ self.end_source()
+ self.end_collapsible()
+
+ self.start_collapsible("Best Postgres plan diff with YB best")
+ self.start_source(["diff"])
+ self.content += get_plan_diff(
+ best_pg.execution_plan.full_str,
+ best_optimization.execution_plan.full_str,
+ )
+ self.end_source()
+ self.end_collapsible()
if show_best:
self.__report_heatmap(query)
- self._start_collapsible("Original plan")
- self._start_source(["diff"])
- self.report += query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
+ self.start_collapsible("Original plan")
+ self.start_source(["diff"])
+ self.content += query.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
- self._start_collapsible("Best plan")
- self._start_source(["diff"])
- self.report += best_optimization.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
+ self.start_collapsible("Best plan")
+ self.start_source(["diff"])
+ self.content += best_optimization.execution_plan.full_str
+ self.end_source()
+ self.end_collapsible()
- self._start_source(["diff"])
+ self.start_source(["diff"])
- diff = self._get_plan_diff(query.execution_plan.full_str,
- best_optimization.execution_plan.full_str)
+ diff = get_plan_diff(
+ query.execution_plan.full_str,
+ best_optimization.execution_plan.full_str
+ )
if not diff:
diff = query.execution_plan.full_str
- self.report += diff
- self._end_source()
- self._end_table_row()
+ self.content += diff
+ self.end_source()
+ self.end_table_row()
- self.report += "\n"
+ self.content += "\n"
- self._end_table()
+ self.end_table()
- self._add_double_newline()
+ self.add_double_newline()
diff --git a/src/collect.py b/src/collect.py
new file mode 100644
index 00000000..460a3451
--- /dev/null
+++ b/src/collect.py
@@ -0,0 +1,60 @@
+import dataclasses
+import json
+import os
+from typing import List, Type
+
+from dacite import Config as DaciteConfig
+from dacite import from_dict
+
+from objects import Query
+
+
+@dataclasses.dataclass
+class CollectResult:
+ db_version: str = ""
+ git_message: str = ""
+ ddl_execution_time: int = 0
+ model_execution_time: int = 0
+ config: str = ""
+ database_config: str = ""
+ model_queries: List[str] = None
+ queries: List[Type[Query]] = None
+
+ def append(self, new_element):
+ if not self.queries:
+ self.queries = [new_element, ]
+ else:
+ self.queries.append(new_element)
+
+ # CPUs are cheap
+ self.queries.sort(key=lambda q: q.query_hash)
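+        # keep results ordered by query hash so lookups and reports are deterministic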
+
+ def find_query_by_hash(self, query_hash) -> Type[Query] | None:
+ return next(
+ (query for query in self.queries if query.query_hash == query_hash),
+ None,
+ )
+
+
+class EnhancedJSONEncoder(json.JSONEncoder):
+ def default(self, o):
+ if dataclasses.is_dataclass(o):
+ return dataclasses.asdict(o)
+ return super().default(o)
+
+
+class ResultsLoader:
+
+ def __init__(self):
+ self.clazz = CollectResult
+
+ def get_queries_from_previous_result(self, previous_execution_path):
+ with open(previous_execution_path, "r") as prev_result:
+ return from_dict(self.clazz, json.load(prev_result), DaciteConfig(check_types=False))
+
+ def store_queries_to_file(self, queries: Type[CollectResult], output_json_name: str):
+ if not os.path.isdir("report"):
+ os.mkdir("report")
+
+ with open(f"report/{output_json_name}.json", "w") as result_file:
+ result_file.write(json.dumps(queries, cls=EnhancedJSONEncoder))
diff --git a/src/config.py b/src/config.py
index 55de6f9a..8cccc035 100644
--- a/src/config.py
+++ b/src/config.py
@@ -1,6 +1,8 @@
import dataclasses
import logging
import sys
+import pprint
+from copy import copy
from enum import Enum
from typing import List, Set
@@ -35,7 +37,8 @@ class DDLStep(Enum):
CREATE = 1
ANALYZE = 2
IMPORT = 3
- DROP = 4
+ COMPACT = 4
+ DROP = 5
@dataclasses.dataclass
@@ -47,13 +50,17 @@ class ConnectionConfig:
database: str = None
def __str__(self):
- return f"{self.host}:{self.port}@{self.username}:{self.password}, database '{self.database}'"
+ return f"{self.host}:{self.port}@{self.username}:*******, database '{self.database}'"
@dataclasses.dataclass
class Config(metaclass=Singleton):
logger: logging.Logger = None
+ exit_on_fail: bool = False
+ has_failures: bool = False
+ has_warnings: bool = False
+
database: Database = None
remote_data_path: str = "."
@@ -71,18 +78,24 @@ class Config(metaclass=Singleton):
connection: ConnectionConfig = None
compare_with_pg: bool = False
- enable_statistics: bool = False
explain_clause: str = ""
+ server_side_execution: bool = False
session_props: List[str] = None
test: str = None
model: str = None
+ baseline_path: str = None
+ baseline_results: any = None
+ all_index_check: bool = None
+ bitmap_enabled: bool = None
+ load_catalog_tables: bool = None
basic_multiplier: int = None
ddls: Set[DDLStep] = None
clean_db: bool = None
allow_destroy_db: bool = None
clean_build: bool = None
+ colocated_database: bool = None
skip_percentage_delta: float = None
look_near_best_plan: bool = None
@@ -93,47 +106,24 @@ class Config(metaclass=Singleton):
skip_timeout_delta: int = None
ddl_query_timeout: int = None
test_query_timeout: int = None
+ compaction_timeout: int = None
all_pairs_threshold: int = None
+ yugabyte_bin_path: str = None
+ yugabyte_collect_stats: bool = True
+ yugabyte_master_addresses: str = None
+
asciidoctor_path: str = None
clear: bool = False
def __str__(self):
- return "Configuration" + \
- f"DB - {self.database.__class__.__name__}\n" \
- f"\n" \
- f"remote_data_path - {self.remote_data_path}\n" \
- f"ddl_prefix - {self.ddl_prefix}\n" \
- f"with_optimizations - {self.with_optimizations}\n" \
- f"source_path - {self.source_path}\n" \
- f"output - {self.output}\n" \
- f"\n" \
- f"revision - {self.revision}\n" \
- f"num_nodes - {self.num_nodes}\n" \
- f"tserver_flags - {self.tserver_flags}\n" \
- f"master_flags - {self.master_flags}\n" \
- f"\n" \
- f"(initial) connection - {self.connection}\n" \
- f"enable_statistics - {self.enable_statistics}\n" \
- f"explain_clause - {self.explain_clause}\n" \
- f"session_props - {self.session_props}\n" \
- f"\n" \
- f"test - {self.test}\n" \
- f"model - {self.model}\n" \
- f"basic_multiplier - {self.basic_multiplier}\n" \
- f"ddls - {[m.name for m in self.ddls]}\n" \
- f"clean_db - {self.clean_db}\n" \
- f"allow_destroy_db - {self.allow_destroy_db}\n" \
- f"clean_build - {self.clean_build}\n" \
- f"skip_percentage_delta - {self.skip_percentage_delta}\n" \
- f"look_near_best_plan - {self.look_near_best_plan}\n" \
- f"num_queries - {self.num_queries}\n" \
- f"parametrized - {self.parametrized}\n" \
- f"num_retries - {self.num_retries}\n" \
- f"num_warmup - {self.num_warmup}\n" \
- f"skip_timeout_delta - {self.skip_timeout_delta}\n" \
- f"ddl_query_timeout - {self.ddl_query_timeout}\n" \
- f"test_query_timeout - {self.test_query_timeout}\n" \
- f"all_pairs_threshold - {self.all_pairs_threshold}\n" \
- f"asciidoctor_path - {self.asciidoctor_path}\n" \
- f"clear - {self.clear}\n"
+ skipped_fields = ['logger', 'database', 'baseline_results', 'has_failures', 'exit_on_fail']
+
+ self_dict = copy(vars(self))
+ for field in skipped_fields:
+ self_dict.pop(field)
+
+ self_dict['connection'] = str(self_dict['connection'])
+ self_dict['ddls'] = str([m.name for m in self.ddls])
+
+ return str(pprint.pformat(self_dict))
diff --git a/src/db/abstract.py b/src/db/abstract.py
new file mode 100644
index 00000000..eb6d00db
--- /dev/null
+++ b/src/db/abstract.py
@@ -0,0 +1,66 @@
+from abc import ABC, abstractmethod
+
+
+class PlanNodeAccessor(ABC):
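+    # Engine-specific accessors implement these hooks so plan-analysis code can read node
+    # costs, scan types and filters without depending on a particular EXPLAIN output format.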
+ @staticmethod
+ @abstractmethod
+ def has_valid_cost(node):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def fixup_invalid_cost(node):
+ pass
+
+ # ScanNode methods
+
+ @staticmethod
+ @abstractmethod
+ def is_seq_scan(node):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def is_index_scan(node):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def is_index_only_scan(node):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def get_index_cond(node, with_label=False):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def may_have_table_fetch_by_rowid(node):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def get_remote_filter(node, with_label=False):
+ pass
+
+ # Table Fetch By Rowid
+ @staticmethod
+ @abstractmethod
+ def get_remote_tfbr_filter(node, with_label=False):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def get_local_filter(node, with_label=False):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def get_rows_removed_by_recheck(node, with_label=False):
+ pass
+
+ @staticmethod
+ @abstractmethod
+ def is_scan_with_partial_aggregate(node):
+ pass
diff --git a/src/db/database.py b/src/db/database.py
index 25b4a9f3..f3456536 100644
--- a/src/db/database.py
+++ b/src/db/database.py
@@ -1,16 +1,31 @@
+from psycopg2._psycopg import cursor
+
+
class Database:
def __init__(self, config):
self.config = config
self.logger = self.config.logger
self.connection = None
+ def run_compaction(self, tables: list[str]):
+ pass
+
+ def establish_connection(self, database: str):
+ pass
+
+ def get_list_queries(self):
+ pass
+
def change_version_and_compile(self, revision_or_path: str = None):
pass
def create_test_database(self):
pass
- def prepare_query_execution(self, cur):
+ def drop_test_database(self):
+ pass
+
+ def prepare_query_execution(self, cur, query):
pass
def set_query_timeout(self, cur, timeout):
@@ -35,4 +50,16 @@ def get_execution_plan(self, execution_plan: str):
pass
def get_results_loader(self):
- pass
\ No newline at end of file
+ pass
+
+ def reset_query_statics(self, cur: cursor):
+ pass
+
+ def collect_query_statistics(self, cur: cursor, query, query_str: str):
+ pass
+
+ def get_revision_version(self, cur: cursor):
+ pass
+
+ def get_database_config(self, cur: cursor):
+ return ""
diff --git a/src/db/postgres.py b/src/db/postgres.py
index 25e0d6fb..55396d80 100644
--- a/src/db/postgres.py
+++ b/src/db/postgres.py
@@ -1,5 +1,6 @@
import dataclasses
import itertools
+import requests
import re
from difflib import SequenceMatcher
from enum import Enum
@@ -7,12 +8,15 @@
import psycopg2
from allpairspy import AllPairs
+from psycopg2._psycopg import cursor
+from collect import CollectResult, ResultsLoader
from config import Config, ConnectionConfig, DDLStep
-from objects import Query, EPNode, ExecutionPlan, ListOfOptimizations, Table, Optimization, \
- ListOfQueries, ResultsLoader
+from objects import Query, ExecutionPlan, ListOfOptimizations, Table, Optimization, ExplainFlags
+from objects import AggregateNode, JoinNode, SortNode, PlanNode, ScanNode
+from db.abstract import PlanNodeAccessor
from db.database import Database
-from utils import evaluate_sql, allowed_diff
+from utils import evaluate_sql, allowed_diff, parse_clear_and_parametrized_sql
DEFAULT_USERNAME = 'postgres'
DEFAULT_PASSWORD = 'postgres'
@@ -24,15 +28,48 @@
PLAN_CLEANUP_REGEX = r"\s\(actual time.*\)|\s\(never executed\)|\s\(cost.*\)|" \
r"\sMemory:.*|Planning Time.*|Execution Time.*|Peak Memory Usage.*|" \
+ r"Storage Read Requests:.*|Storage Read Execution Time:.*|Storage Write Requests:.*|" \
+ r"Catalog Read Requests:.*|Catalog Read Execution Time:.*|Catalog Write Requests:.*|" \
+ r"Catalog Reads Requests:.*|Catalog Reads Execution Time:.*|Catalog Writes Requests:.*|" \
+ r"Storage Flushes Requests:.*|Storage Execution Time:.*|" \
+ r"Storage Table Read Requests:.*|Storage Table Read Execution Time:.*|Output:.*|" \
+ r"Storage Index Read Requests:.*|Storage Index Read Execution Time:.*|" \
+ r"Storage Flush Requests:.*|" \
+ r"Disk:.*|" \
+ r"Metric rocksdb_.*:.*|" \
r"Read RPC Count:.*|Read RPC Wait Time:.*|DocDB Scanned Rows:.*|" \
r".*Partial Aggregate:.*|YB\s|Remote\s|" \
- r"JIT:.*|\s+Functions:.*|\s+Options:.*|\s+Timing:.*" # PG14 JIT info
+ r"JIT:.*|\s+Functions:.*|\s+Options:.*|\s+Timing:.*"
PLAN_RPC_CALLS = r"\nRead RPC Count:\s(\d+)"
PLAN_RPC_WAIT_TIMES = r"\nRead RPC Wait Time:\s([+-]?([0-9]*[.])?[0-9]+)"
PLAN_DOCDB_SCANNED_ROWS = r"\nDocDB Scanned Rows:\s(\d+)"
PLAN_PEAK_MEMORY = r"\nPeak memory:\s(\d+)"
PLAN_TREE_CLEANUP = r"\n\s*->\s*|\n\s*"
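+# The patterns below decompose EXPLAIN ANALYZE output: plan node headers (costs, rows, width,
+# actual timings), scan node descriptions, and Hash node bucket/batch/memory properties.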
+plan_node_header_pattern = re.compile(''.join([
+    r'(?P<name>\S+(?:\s+\S+)*)',
+    r'\s+',
+    r'\(cost=(?P<startup_cost>\d+\.\d*)\.\.(?P<total_cost>\d+\.\d*)\s+rows=(?P<plan_rows>\d+)\s+width=(?P<plan_width>\d+)\)',
+    r'\s+',
+    r'\((?:(?:actual time=(?P<startup_ms>\d+\.\d*)\.\.(?P<total_ms>\d+\.\d*) +rows=(?P<rows>\d+)',
+    r' +loops=(?P<loops>\d+))|(?:(?P<never_executed>never executed)))\)',
+]))
+
+node_name_decomposition_pattern = re.compile(''.join([
+    r'(?P<parallel>Parallel )*(?P<distinct>Distinct )*(?P<node_type>\S+(?:\s+\S+)* Scan)(?P<backward>\s+Backward)*'
+    r'(?: using (?P<index_name>\S+))*'
+    r' on (?:(?P<schema>\S+)\.)*(?P<table_name>\S+)(?: (?P<table_alias>\S+))*']))
+
+hash_property_decomposition_pattern = re.compile(''.join([
+    r'Buckets: (?P<buckets>\d+)(?: originally (?P<original_buckets>\d+))* ',
+    r'Batches: (?P<batches>\d+)(?: originally (?P<original_batches>\d+))* ',
+    r'Memory Usage: (?P<memory_usage>\d+)kB',
+]))
+
+PG_DISABLE_COST = 10000000000.00
+
+VERSION = r"PostgreSQL\s+(\d+\.\d+)"
+
class Postgres(Database):
@@ -47,21 +84,41 @@ def establish_connection(self, database: str = "postgres"):
self.connection.connect()
- def prepare_query_execution(self, cur):
+ def prepare_query_execution(self, cur, query_object):
for query in self.config.session_props:
evaluate_sql(cur, query)
+ for query in query_object.optimizer_tips.debug_queries:
+ evaluate_sql(cur, query)
+
def create_test_database(self):
if DDLStep.DATABASE in self.config.ddls:
self.establish_connection("postgres")
conn = self.connection.conn
try:
with conn.cursor() as cur:
- colocated = "" if self.config.ddl_prefix else " WITH COLOCATED = true"
- evaluate_sql(cur, f'CREATE DATABASE {self.config.connection.database}{colocated};')
+ evaluate_sql(cur, f'CREATE DATABASE {self.config.connection.database};')
except Exception as e:
self.logger.exception(f"Failed to create testing database {e}")
+ def drop_test_database(self):
+ try:
+ self.connection.conn.close()
+ except Exception as e:
+ self.logger.exception(f"Failed to close testing database connection {e}")
+
+ self.establish_connection("postgres")
+ conn = self.connection.conn
+ try:
+ with conn.cursor() as cur:
+ evaluate_sql(cur, f'DROP DATABASE {self.config.connection.database};')
+ except Exception as e:
+ self.logger.exception(f"Failed to drop testing database {e}")
+
+ def set_query_timeout(self, cur, timeout):
+ self.logger.debug(f"Setting statement timeout to {timeout} seconds")
+ evaluate_sql(cur, f"SET statement_timeout = '{timeout}s'")
+
def get_list_optimizations(self, original_query):
return PGListOfOptimizations(
self.config, original_query).get_all_optimizations()
@@ -73,7 +130,16 @@ def get_results_loader(self):
return PostgresResultsLoader()
def get_list_queries(self):
- return PostgresListOfQueries()
+ return PostgresCollectResult()
+
+ def get_revision_version(self, cur: cursor):
+ evaluate_sql(cur, 'SELECT VERSION();')
+ version = re.findall(VERSION, cur.fetchone()[0], re.MULTILINE)
+
+ if version:
+ return "PG", version[0]
+ else:
+ return "UNKNOWN_VERSION"
class Connection:
@@ -91,23 +157,19 @@ def connect(self):
password=self.connection_config.password)
self.conn.autocommit = True
- def get_version(self):
- with self.conn.cursor() as cur:
- evaluate_sql(cur, 'SELECT VERSION();')
- return cur.fetchone()[0]
-
class Scans(Enum):
SEQ = "SeqScan"
INDEX = "IndexScan"
INDEX_ONLY = "IndexOnlyScan"
- # BITMAP = "BitmapScan"
+ BITMAP = "BitmapScan"
class Joins(Enum):
HASH = "HashJoin", "Hash"
MERGE = "MergeJoin", "Merge"
NESTED_LOOP = "NestLoop", "Nested Loop"
+ YB_NESTED_LOOP = "YbBatchedNL", "YB Nested Loop"
def construct(self, tables: List[str]):
return f"{self.value[0]}({' '.join(tables)})"
@@ -116,42 +178,25 @@ def construct(self, tables: List[str]):
class Leading:
LEADING = "Leading"
- def __init__(self, config: Config, alias_to_table: List[Table]):
+ def __init__(self, config: Config, tables: List[Table]):
self.config = config
- self.alias_to_table = alias_to_table
+ self.tables = tables
self.joins = []
self.table_scan_hints = []
def construct(self):
if self.config.all_pairs_threshold == -1:
self.get_all_combinations()
- elif len(self.alias_to_table) < self.config.all_pairs_threshold:
+ elif len(self.tables) < self.config.all_pairs_threshold:
self.get_all_combinations()
+ elif len(self.tables) == self.config.all_pairs_threshold:
+ self.get_all_pairs_with_all_table_permutations()
else:
self.get_all_pairs_combinations()
- def filtered_permutations(self, tables):
- # todo check how it works
- perms = list(itertools.permutations(tables))
-
- if len(tables) < self.config.all_pairs_threshold:
- return perms
-
- combs = list(itertools.combinations(tables, len(tables) - 1))
-
- result = []
- for perm in perms:
- perm_join = "".join([table.name for table in perm])
- for comb in combs:
- comb_join = "".join([table.name for table in comb])
- if comb_join in perm_join:
- result.append(perm)
-
- return result
-
def get_all_combinations(self):
# algorithm with all possible combinations
- for tables_perm in itertools.permutations(self.alias_to_table):
+ for tables_perm in itertools.permutations(self.tables):
prev_el = None
joins = []
joined_tables = []
@@ -175,26 +220,16 @@ def get_all_combinations(self):
for join in joins:
self.joins.append(f"{self.LEADING} ( {prev_el} ) {join}")
- for table in self.alias_to_table:
- tables_and_idxs = list({f"{Scans.INDEX.value}({table.alias})"
- for field in table.fields if field.is_index})
- tables_and_idxs += {f"{Scans.INDEX_ONLY.value}({table.alias})"
- for field in table.fields if field.is_index}
- tables_and_idxs.append(f"{Scans.SEQ.value}({table.alias})")
- self.table_scan_hints.append(tables_and_idxs)
-
- def get_all_pairs_combinations(self):
- if len(self.alias_to_table) <= 1:
- return
+ self.table_scan_hints = itertools.product(*self.get_table_scan_hints())
- # todo to reduce number of pairs combinations used here
- # while its not produce overwhelming amount of optimizations
- # it should provide enough number of combinations
- table_combinations = list(self.filtered_permutations(self.alias_to_table))
- join_product = list(AllPairs([list(Joins) for _ in range(len(self.alias_to_table) - 1)]))
- scan_product = list(AllPairs([list(Scans) for _ in range(len(self.alias_to_table))]))
+ def get_all_pairs_with_all_table_permutations(self):
+ # algorithm with all possible table permutations
+ # but with all pairs scans
+ table_permutations = list(itertools.permutations(self.tables))
+ join_product = list(AllPairs([list(Joins) for _ in range(len(self.tables) - 1)]))
+ scan_product = list(AllPairs(self.get_table_scan_hints()))
- for tables, joins, scans in AllPairs([table_combinations, join_product, scan_product]):
+ for tables, joins, scans in AllPairs([table_permutations, join_product, scan_product]):
prev_el = None
joins = itertools.cycle(joins)
query_joins = ""
@@ -208,49 +243,98 @@ def get_all_pairs_combinations(self):
query_joins += f" {next(joins).construct(joined_tables)}"
leading_hint = f"{self.LEADING} ({prev_el})"
- scan_hints = " ".join(
- f"{scan.value}({tables[table_idx].alias})" for table_idx, scan in
- enumerate(scans))
+ scan_hints = " ".join(scans)
self.joins.append(f"{leading_hint} {query_joins} {scan_hints}")
+ def get_all_pairs_combinations(self):
+ if len(self.tables) <= 1:
+ return
+
+ self.table_scan_hints = list(AllPairs(self.get_table_scan_hints()))
+
+ def get_table_scan_hints(self):
+ table_scan_hints = []
+ for table in self.tables:
+ tables_and_idxs = {f"{Scans.SEQ.value}({table.alias})",
+ f"{Scans.INDEX.value}({table.alias})",
+ f"{Scans.INDEX_ONLY.value}({table.alias})"}
+ if self.config.bitmap_enabled:
+ tables_and_idxs |= {f"{Scans.BITMAP.value}({table.alias})", }
+
+ if self.config.all_index_check:
+ indexes = []
+ for field in table.fields:
+ if field.is_index:
+ indexes += field.indexes
+
+ tables_and_idxs |= {
+ f"{Scans.INDEX.value}({table.alias} {index})"
+ for index in indexes
+ }
+ tables_and_idxs |= {
+ f"{Scans.INDEX_ONLY.value}({table.alias} {index})"
+ for index in indexes
+ }
+ if self.config.bitmap_enabled:
+ tables_and_idxs |= {
+ f"{Scans.BITMAP.value}({table.alias} {index})"
+ for index in indexes
+ }
+ else:
+ tables_and_idxs |= {
+ f"{Scans.INDEX.value}({table.alias})"
+ for field in table.fields
+ if field.is_index
+ }
+ tables_and_idxs |= {
+ f"{Scans.INDEX_ONLY.value}({table.alias})"
+ for field in table.fields
+ if field.is_index
+ }
+ if self.config.bitmap_enabled:
+ tables_and_idxs |= {
+ f"{Scans.BITMAP.value}({table.alias})"
+ for field in table.fields
+ if field.is_index
+ }
+
+ table_scan_hints.append(list(tables_and_idxs))
+
+ return table_scan_hints
+
@dataclasses.dataclass
class PostgresQuery(Query):
execution_plan: 'PostgresExecutionPlan' = None
optimizations: List['PostgresOptimization'] = None
- def get_query(self):
- return self.query
+ def get_debug_hints(self):
+ return f"/*+ {self.optimizer_tips.debug_hints} */ " if self.optimizer_tips.debug_hints else ""
- def get_explain(self):
- return f"{Config().explain_clause} {self.query}"
-
- def get_heuristic_explain(self):
- return f"EXPLAIN {self.query}"
-
- def get_explain_analyze(self):
- return f"EXPLAIN ANALYZE {self.query}"
-
- def tips_looks_fair(self, optimization):
- clean_plan = self.execution_plan.get_clean_plan()
-
- return not any(
- join.value[0] in optimization.explain_hints and join.value[1] not in clean_plan
- for join in Joins)
-
- def compare_plans(self, execution_plan: Type['ExecutionPlan']):
- if clean_plan := self.execution_plan.get_clean_plan():
- return clean_plan == self.execution_plan.get_clean_plan(execution_plan)
+ def get_query(self):
+ return f"{self.get_debug_hints()}{self.query}"
+
+ def compare_plans(self, query: Type['Query']):
+ if (self.cost_off_explain and self.cost_off_explain.is_present() and
+ query.cost_off_explain and query.cost_off_explain.is_present()):
+ return self.cost_off_explain == query.cost_off_explain
+ elif self.execution_plan and self.execution_plan.is_present():
+ return self.execution_plan.get_clean_plan() == self.execution_plan.get_clean_plan(query.execution_plan)
else:
return False
+ def get_reportable_query(self):
+ _, _, sql_wo_parameters = parse_clear_and_parametrized_sql(self.query.replace("|", "\|"))
+ return sql_wo_parameters
+
def __str__(self):
return f"Query - \"{self.query}\"\n" \
f"Tables - \"{self.tables}\"\n" \
f"Optimization hints - \"{self.explain_hints}\"\n" \
f"Execution plan - \"{self.execution_plan}\"\n" \
- f"Execution time - \"{self.execution_time_ms}\""
+ f"Execution (COST OFF) plan - \"{self.cost_off_explain}\"\n" \
+ f"Execution time (ms) - \"{self.execution_time_ms}\""
def heatmap(self):
config = Config()
@@ -267,10 +351,8 @@ def heatmap(self):
for plan_line in plan_heatmap.values():
for optimization_line in no_cost_plan.split("->"):
if SequenceMatcher(
- a=optimization.execution_plan.get_no_tree_plan_str(
- plan_line['str']),
- b=optimization.execution_plan.get_no_tree_plan_str(
- optimization_line)
+ a=optimization.execution_plan.get_no_tree_plan_str(plan_line['str']),
+ b=optimization.execution_plan.get_no_tree_plan_str(optimization_line)
).ratio() > 0.9:
plan_line['weight'] += 1
@@ -281,10 +363,10 @@ def heatmap(self):
def get_best_optimization(self, config):
best_optimization = self
if best_optimization.optimizations:
- for optimization in best_optimization.optimizations:
- if best_optimization.execution_time_ms < 0:
- best_optimization = optimization
- elif 0 < optimization.execution_time_ms < best_optimization.execution_time_ms:
+ for optimization in self.optimizations:
+ best_execution_time = best_optimization.execution_time_ms \
+ if best_optimization.execution_time_ms != -1 else 99999999
+ if 0 < optimization.execution_time_ms < best_execution_time:
best_optimization = optimization
if allowed_diff(config, best_optimization.execution_time_ms, self.execution_time_ms):
@@ -292,48 +374,211 @@ def get_best_optimization(self, config):
return best_optimization
+ def get_inconsistent_results(self):
+ if not self.result_hash or "skip_consistency_check" in self.optimizer_tips.tags or not self.optimizations:
+ return []
+
+ return [optimization.explain_hints for optimization in self.optimizations
+ if optimization.result_hash and self.result_hash != optimization.result_hash]
+
@dataclasses.dataclass
class PostgresOptimization(PostgresQuery, Optimization):
execution_plan: 'PostgresExecutionPlan' = None
+ def get_default_tipped_query(self):
+ return f"/*+ {self.optimizer_tips.debug_hints} {self.explain_hints} */ {self.query}"
+
def get_query(self):
- return f"/*+ {self.explain_hints} */ {self.query}"
+ return self.get_default_tipped_query()
+
+ def get_explain(self, explain_clause: str = None, options: List[ExplainFlags] = None):
+ if not explain_clause:
+ explain_clause = Config().explain_clause
+
+ options_clause = f" ({', '.join([opt.value for opt in options])})" if options else ""
+
+ return f"{explain_clause}{options_clause} {self.get_default_tipped_query()}"
+
+
+class PostgresPlanNodeAccessor(PlanNodeAccessor):
+ @staticmethod
+ def has_valid_cost(node):
+ return float(node.total_cost) < PG_DISABLE_COST
+
+ @staticmethod
+ def fixup_invalid_cost(node):
+ from sys import float_info
+ from math import log
+ scost = float(node.startup_cost)
+ tcost = float(node.total_cost)
+ if ((scost > 0 and log(scost, 10) >= float_info.mant_dig - 1)
+ or (tcost > 0 and log(tcost, 10) >= float_info.mant_dig - 1)):
+ return True
+
+ node.startup_cost = round(scost % PG_DISABLE_COST, 3)
+ node.total_cost = round(tcost % PG_DISABLE_COST, 3)
+ return False
+
+ @staticmethod
+ def is_seq_scan(node):
+ return node.node_type == 'Seq Scan' or node.node_type == 'YB Seq Scan'
+
+ @staticmethod
+ def is_index_scan(node):
+ return node.node_type == 'Index Scan'
+
+ @staticmethod
+ def is_index_only_scan(node):
+ return node.node_type == 'Index Only Scan'
+
+ @staticmethod
+ def get_index_cond(node, with_label=False):
+ return node.get_property('Index Cond', with_label)
+
+ @staticmethod
+ def may_have_table_fetch_by_rowid(node):
+ return (PostgresPlanNodeAccessor.is_index_scan(node)
+ and not node.index_name.endswith('_pkey'))
+
+ @staticmethod
+ def get_remote_filter(node, with_label=False):
+ return node.get_property('Remote Index Filter'
+ if PostgresPlanNodeAccessor.may_have_table_fetch_by_rowid(node)
+ else 'Remote Filter', with_label)
+
+ @staticmethod
+ def get_remote_tfbr_filter(node, with_label=False):
+ return (node.get_property('Remote Filter', with_label)
+ if node.may_have_table_fetch_by_rowid() else '')
- def get_explain(self):
- return f"{Config().explain_clause} /*+ {self.explain_hints} */ {self.query}"
+ @staticmethod
+ def get_local_filter(node, with_label=False):
+ return node.get_property('Filter', with_label)
- def get_heuristic_explain(self):
- return f"EXPLAIN /*+ {self.explain_hints} */ {self.query}"
+ @staticmethod
+ def get_rows_removed_by_recheck(node, with_label=False):
+ return int(node.get_property('Rows Removed by Index Recheck', with_label)
+ or node.get_property('Rows Removed by Recheck', with_label)
+ or 0)
+
+ @staticmethod
+ def is_scan_with_partial_aggregate(node):
+ return bool(node.get_property('Partial Aggregate'))
@dataclasses.dataclass
class PostgresExecutionPlan(ExecutionPlan):
- full_str: str
-
- def parse_tree(self):
- root = EPNode()
- current_node = root
- for line in self.full_str.split("\n"):
- if line.strip().startswith("->"):
- level = int(line.find("->") / 2)
- previous_node = current_node
- current_node = EPNode()
- current_node.level = level
- current_node.full_str += line
-
- if previous_node.level <= current_node.level:
- previous_node.childs.append(current_node)
- current_node.root = previous_node
+ __node_accessor = PostgresPlanNodeAccessor()
+
+ def make_node(self, node_name):
+ index_name = table_name = table_alias = is_backward = is_parallel = None
+ if match := node_name_decomposition_pattern.search(node_name):
+ # strip off the schema name added by EXPLAIN VERBOSE
+ if schema_name := match.group('schema'):
+ node_name = node_name.replace(f" {schema_name}.", ' ')
+ node_type = match.group('type')
+ index_name = match.group('index')
+ is_parallel = match.group('parallel') is not None
+ is_distinct = match.group('distinct') is not None
+ is_backward = match.group('backward') is not None
+ table_name = match.group('table')
+ table_alias = match.group('alias')
+ if node_type.startswith('Bitmap Index Scan'):
+ index_name = table_name
+ table_name = self.get_table_name_from_index_name(index_name)
+ else:
+ node_type = node_name
+
+ if table_name:
+ return ScanNode(self.__node_accessor, node_type, node_name, table_name, table_alias,
+ index_name, is_backward, is_distinct, is_parallel)
+
+ if 'Join' in node_type or 'Nested Loop' in node_type:
+ return JoinNode(self.__node_accessor, node_type, node_name)
+
+ if 'Aggregate' in node_type or 'Group' in node_type:
+ return AggregateNode(self.__node_accessor, node_type, node_name)
+
+ if 'Sort' in node_type:
+ return SortNode(self.__node_accessor, node_type, node_name)
+
+ return PlanNode(self.__node_accessor, node_type, node_name)
+
+ def parse_plan(self):
+ node = None
+ prev_level = 0
+ current_path = []
+ for node_str in self.full_str.split('->'):
+ node_level = prev_level
+            # the trailing spaces after the last newline are the indentation of the next node
+ node_end = node_str.rfind('\n')
+ indent = int(node_str.count(' ', node_end))
+ # postgres explain.c adds 6 whitespaces at each indentation level with " -> "
+ # for each node header. add back 4 for "-> " before division because we split
+ # it at each '->'.
+ prev_level = int((indent + 4) / 6)
+
+ node_props = (node_str[:node_end].splitlines() if node_str.endswith('\n')
+ else node_str.splitlines())
+
+ if not node_props:
+ break
+
+ if match := plan_node_header_pattern.search(node_props[0]):
+ node_name = match.group('name')
+ node = self.make_node(node_name)
+
+ is_scan_node = isinstance(node, ScanNode)
+ node.level = node_level
+ node.startup_cost = match.group('sc')
+ node.total_cost = match.group('tc')
+ node.plan_rows = match.group('prows')
+ node.plan_width = match.group('width')
+ if match.group('never'):
+ node.nloops = 0
else:
- walking_node = previous_node.root
- while walking_node.level != current_node.level:
- walking_node = walking_node.root
- walking_node = walking_node.root
- walking_node.childs.append(current_node)
- current_node.root = walking_node
+ node.startup_ms = match.group('st')
+ node.total_ms = match.group('tt')
+ node.rows = match.group('rows')
+ node.nloops = match.group('loops')
else:
- current_node.full_str += line
+ break
+
+ for prop in node_props[1:]:
+ if prop.startswith(' '):
+ prop_str = prop.strip()
+ if match := hash_property_decomposition_pattern.search(prop_str):
+ node.properties['Hash Buckets'] = match.group('buckets')
+
+ if orig_buckets := match.group('orig_buckets'):
+ node.properties['Original Hash Buckets'] = orig_buckets
+
+ node.properties['Hash Batches'] = match.group('batches')
+
+ if orig_batches := match.group('orig_batches'):
+ node.properties['Original Hash Batches'] = orig_batches
+
+ node.properties['Peak Memory Usage'] = match.group('peak_mem')
+ else:
+ if (keylen := prop_str.find(':')) > 0:
+ pkey = prop_str[:keylen]
+ pval = prop_str[keylen + 1:].strip()
+ # strip off the alias added by EXPLAIN VERBOSE
+ if (is_scan_node
+ and (pkey == 'Index Cond' or pkey.find('Filter') > 0)):
+ pval = pval.replace(f'{node.table_alias or node.table_name}.', '')
+
+ node.properties[pkey] = pval
+
+ if not current_path:
+ current_path.append(node)
+ else:
+ while len(current_path) > node.level:
+ current_path.pop()
+ current_path[-1].child_nodes.append(node)
+
+ return current_path[0] if current_path else None
def __cmp__(self, other):
if isinstance(other, str):
@@ -350,6 +595,7 @@ def get_estimated_cost(self):
re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
return float(match.groups()[0])
+ return 0
except Exception as e:
return 0
@@ -397,11 +643,17 @@ def get_no_tree_plan(self, execution_plan: 'PostgresExecutionPlan' = None):
def get_no_tree_plan_str(plan_str):
return re.sub(PLAN_TREE_CLEANUP, '\n', plan_str).strip()
- def get_clean_plan(self, execution_plan: Type['ExecutionPlan'] = None):
+ def get_clean_plan(self, execution_plan: ExecutionPlan = None):
no_tree_plan = re.sub(PLAN_TREE_CLEANUP, '\n',
execution_plan.full_str if execution_plan else self.full_str).strip()
return re.sub(PLAN_CLEANUP_REGEX, '', no_tree_plan).strip()
+ @staticmethod
+ def get_table_name_from_index_name(index_name):
+ # we could figure it out from the metadata, however, just assume
+ # "_..." naming convention for now.
+ return index_name[:index_name.find('_')]
+
@dataclasses.dataclass
class PGListOfOptimizations(ListOfOptimizations):
@@ -415,17 +667,12 @@ def __init__(self, config: Config, query: PostgresQuery):
def get_all_optimizations(self) -> List[Optimization]:
optimizations = []
for leading_join in self.leading.joins:
- for table_scan_hint in itertools.product(*self.leading.table_scan_hints):
- explain_hints = f"{leading_join} {' '.join(table_scan_hint)}"
-
- self.add_optimization(explain_hints, optimizations)
+ self.add_optimization(leading_join, optimizations)
if not optimizations and self.leading.table_scan_hints:
# case w/o any joins
- for table_scan_hint in itertools.product(*self.leading.table_scan_hints):
- explain_hints = f"{' '.join(table_scan_hint)}"
-
- self.add_optimization(explain_hints, optimizations)
+ for table_scan_hint in self.leading.table_scan_hints:
+ self.add_optimization(f"{' '.join(table_scan_hint)}", optimizations)
return optimizations
@@ -435,13 +682,15 @@ def add_optimization(self, explain_hints, optimizations):
optimizations.append(
PostgresOptimization(
query=self.query.query,
+ has_order_by=self.query.has_order_by,
query_hash=self.query.query_hash,
+ optimizer_tips=self.query.optimizer_tips,
explain_hints=explain_hints
)
)
-class PostgresListOfQueries(ListOfQueries):
+class PostgresCollectResult(CollectResult):
queries: List[PostgresQuery] = None
@@ -449,4 +698,4 @@ class PostgresResultsLoader(ResultsLoader):
def __init__(self):
super().__init__()
- self.clazz = PostgresListOfQueries
+ self.clazz = PostgresCollectResult
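
For reference, here is a minimal sketch of how the named-group patterns introduced above in `src/db/postgres.py` decompose a single EXPLAIN ANALYZE node header. The group names mirror their usage in `parse_plan()`/`make_node()`; the sample plan line and the table/index names are illustrative only, not taken from the framework.

```python
import re

# Same patterns as plan_node_header_pattern / node_name_decomposition_pattern above;
# repeated here only so the snippet is self-contained.
plan_node_header_pattern = re.compile(
    r'(?P<name>\S+(?:\s+\S+)*)\s+'
    r'\(cost=(?P<sc>\d+\.\d*)\.\.(?P<tc>\d+\.\d*)\s+rows=(?P<prows>\d+)\s+width=(?P<width>\d+)\)\s+'
    r'\((?:(?:actual time=(?P<st>\d+\.\d*)\.\.(?P<tt>\d+\.\d*) +rows=(?P<rows>\d+)'
    r' +loops=(?P<loops>\d+))|(?:(?P<never>never executed)))\)')

node_name_decomposition_pattern = re.compile(
    r'(?P<parallel>Parallel )*(?P<distinct>Distinct )*(?P<type>\S+(?:\s+\S+)* Scan)(?P<backward>\s+Backward)*'
    r'(?: using (?P<index>\S+))*'
    r' on (?:(?P<schema>\S+)\.)*(?P<table>\S+)(?: (?P<alias>\S+))*')

# Illustrative node header, as it would appear in EXPLAIN (ANALYZE, VERBOSE) output.
header = ("Index Scan using t1_pkey on public.t1 a  "
          "(cost=0.00..4.11 rows=1 width=8) "
          "(actual time=0.020..0.021 rows=1 loops=1)")

m = plan_node_header_pattern.search(header)
print(m.group('name'), m.group('sc'), m.group('tc'), m.group('prows'), m.group('rows'))
# -> Index Scan using t1_pkey on public.t1 a 0.00 4.11 1 1

n = node_name_decomposition_pattern.search(m.group('name'))
print(n.group('type'), n.group('index'), n.group('schema'), n.group('table'), n.group('alias'))
# -> Index Scan t1_pkey public t1 a
```
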
diff --git a/src/db/yugabyte.py b/src/db/yugabyte.py
index 206d5ea9..d7849ba0 100644
--- a/src/db/yugabyte.py
+++ b/src/db/yugabyte.py
@@ -5,28 +5,38 @@
from time import sleep
from typing import List
-from config import ConnectionConfig
+import requests
+from psycopg2._psycopg import cursor
+
+from collect import CollectResult, ResultsLoader
+from config import ConnectionConfig, DDLStep
from db.postgres import Postgres, PostgresExecutionPlan, PLAN_TREE_CLEANUP, PostgresQuery
-from objects import ExecutionPlan, ListOfQueries, ResultsLoader
-from utils import evaluate_sql
+from objects import ExecutionPlan, QueryStats, Query
+from utils import evaluate_sql, seconds_to_readable_minutes
DEFAULT_USERNAME = 'yugabyte'
DEFAULT_PASSWORD = 'yugabyte'
JDBC_STRING_PARSE = r'\/\/(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)):(\d+)\/([a-z]+)(\?user=([a-z]+)&password=([a-z]+))?'
-ENABLE_STATISTICS_HINT = "SET yb_enable_optimizer_statistics = true;"
-
PLAN_CLEANUP_REGEX = r"\s\(actual time.*\)|\s\(never executed\)|\s\(cost.*\)|" \
r"\sMemory:.*|Planning Time.*|Execution Time.*|Peak Memory Usage.*|" \
+ r"Storage Read Requests:.*|Storage Read Execution Time:.*|Storage Write Requests:.*|" \
+ r"Catalog Reads Requests:.*|Catalog Reads Execution Time:.*|Catalog Writes Requests:.*|" \
+ r"Storage Flushes Requests:.*|Storage Execution Time:.*|" \
+ r"Storage Table Read Requests:.*|Storage Table Read Execution Time:.*|Output:.*|" \
+ r"Storage Index Read Requests:.*|Storage Index Read Execution Time:.*|" \
+ r"Metric rocksdb_.*:.*|" \
r"Read RPC Count:.*|Read RPC Wait Time:.*|DocDB Scanned Rows:.*|" \
r".*Partial Aggregate:.*|YB\s|Remote\s|" \
- r"JIT:.*|\s+Functions:.*|\s+Options:.*|\s+Timing:.*" # PG14 JIT info
+ r"JIT:.*|\s+Functions:.*|\s+Options:.*|\s+Timing:.*"
PLAN_RPC_CALLS = r"\nRead RPC Count:\s(\d+)"
PLAN_RPC_WAIT_TIMES = r"\nRead RPC Wait Time:\s([+-]?([0-9]*[.])?[0-9]+)"
PLAN_DOCDB_SCANNED_ROWS = r"\nDocDB Scanned Rows:\s(\d+)"
PLAN_PEAK_MEMORY = r"\nPeak memory:\s(\d+)"
+VERSION = r"version\s((\d+\.\d+\.\d+\.\d+)\s+build\s+(\d+)\s+revision\s+([0-9a-z]+))"
+
def yb_db_factory(config):
if not config.revision:
@@ -38,6 +48,51 @@ def yb_db_factory(config):
class Yugabyte(Postgres):
+ def run_compaction(self, tables: list[str]):
+ tables_to_optimize = [tables[0], ] if self.config.colocated_database else tables
+
+ self.logger.info(f"Evaluating flush on tables {[table.name for table in tables_to_optimize]}")
+ for table in tables_to_optimize:
+ subprocess.call(f'./yb-admin -init_master_addrs {self.config.connection.host}:7100 '
+ f'flush_table ysql.{self.config.connection.database} {table.name}',
+ shell=True,
+ cwd=self.config.yugabyte_bin_path)
+
+ # Flush sys catalog tables
+ subprocess.call(f'./yb-admin -init_master_addrs {self.config.connection.host}:7100 '
+ f'flush_sys_catalog',
+ shell=True,
+ cwd=self.config.yugabyte_bin_path)
+
+        self.logger.info("Waiting for 2 minutes for operations to complete")
+ sleep(self.config.compaction_timeout)
+
+        self.logger.info("Evaluating compaction on system tables")
+ # Compact sys catalog tables
+ subprocess.call(f'./yb-admin -init_master_addrs {self.config.connection.host}:7100 '
+ f'compact_sys_catalog',
+ shell=True,
+ cwd=self.config.yugabyte_bin_path)
+
+ self.logger.info(f"Evaluating compaction on tables {[table.name for table in tables_to_optimize]}")
+ for table in tables_to_optimize:
+ retries = 1
+ while retries < 5:
+ try:
+ result = subprocess.check_output(
+ f'./yb-admin -init_master_addrs {self.config.connection.host}:7100 '
+ f'compact_table ysql.{self.config.connection.database} {table.name}',
+ shell=True,
+ cwd=self.config.yugabyte_bin_path)
+ self.logger.info(result)
+ break
+ except Exception as e:
+ retries += 1
+
+                self.logger.info(f"Waiting for {seconds_to_readable_minutes(self.config.compaction_timeout)} "
+                                 f"minutes for operations on {table.name} to complete")
+ sleep(self.config.compaction_timeout)
+
def establish_connection_from_output(self, out: str):
self.logger.info("Reinitializing connection based on cluster creation output")
parsing = re.findall(JDBC_STRING_PARSE, out)[0]
@@ -50,17 +105,19 @@ def establish_connection_from_output(self, out: str):
self.logger.info(f"Connection - {self.config.connection}")
- def prepare_query_execution(self, cur):
- super().prepare_query_execution(cur)
-
- if self.config.enable_statistics:
- self.logger.debug("Enable yb_enable_optimizer_statistics flag")
-
- evaluate_sql(cur, ENABLE_STATISTICS_HINT)
+ def create_test_database(self):
+ if DDLStep.DATABASE in self.config.ddls:
+ self.establish_connection("postgres")
+ conn = self.connection.conn
+ try:
+ with conn.cursor() as cur:
+ colocated = " WITH COLOCATED = true" if self.config.colocated_database else ""
+ evaluate_sql(cur, f'CREATE DATABASE {self.config.connection.database}{colocated};')
+ except Exception as e:
+ self.logger.exception(f"Failed to create testing database {e}")
- def set_query_timeout(self, cur, timeout):
- self.logger.debug(f"Setting statement timeout to {timeout} seconds")
- evaluate_sql(cur, f"SET statement_timeout = '{timeout}s'")
+ def prepare_query_execution(self, cur, query_object):
+ super().prepare_query_execution(cur, query_object)
def change_version_and_compile(self, revision_or_path=None):
pass
@@ -80,6 +137,74 @@ def call_upgrade_ysql(self):
def get_execution_plan(self, execution_plan: str):
return YugabyteExecutionPlan(execution_plan)
+ def reset_query_statics(self, cur: cursor):
+ evaluate_sql(cur, "SELECT pg_stat_statements_reset()")
+
+ def get_revision_version(self, cur: cursor):
+ model_result = ""
+ try:
+ model_result = re.findall(VERSION,
+ requests.get(f'http://{self.config.connection.host}:7000/tablet-servers').text,
+ re.MULTILINE)
+
+ if model_result:
+ version = f"{model_result[0][1]}-b{model_result[0][2]}"
+ revision = model_result[0][3]
+
+ return revision, version
+ except Exception:
+ self.logger.error(model_result)
+
+ return 'UNKNOWN', 'UNKNOWN'
+
+ def get_flags(self, port, prefix):
+ response = requests.get(f'http://{self.config.connection.host}:{port}/varz?raw')
+
+ if response.status_code == 200:
+ lines = response.text.splitlines()
+
+ processed_lines = []
+ for line in lines:
+ if '--' in line:
+ line = line.replace('Command-line Flags', '')
+ line = line.replace('--', f'{prefix}=', 1)
+ processed_lines.append(line)
+
+ processed_lines.sort()
+
+ return processed_lines
+
+ def get_database_config(self, cur: cursor):
+ try:
+ return '\n'.join(self.get_flags(7000, 'MASTER') + self.get_flags(9000, 'TSERVER'))
+ except Exception as e:
+ self.logger.error(e)
+
+ return ''
+
+ def collect_query_statistics(self, cur: cursor, query: Query, query_str: str):
+ try:
+ tuned_query = query_str.replace("'", "''")
+ evaluate_sql(cur,
+ "select query, calls, total_time, min_time, max_time, mean_time, rows, yb_latency_histogram "
+ f"from pg_stat_statements where query like '%{tuned_query}%';",
+ force_warning=True,
+ mute_exceptions=True)
+ result = cur.fetchall()
+
+ query.query_stats = QueryStats(
+ calls=result[0][1],
+ total_time=result[0][2],
+ min_time=result[0][3],
+ max_time=result[0][4],
+ mean_time=result[0][5],
+ rows=result[0][6],
+ latency=result[0][7],
+ )
+ except Exception:
+            # TODO: report the failure; for now errors while collecting statistics are ignored
+ pass
+
class YugabyteQuery(PostgresQuery):
execution_plan: 'YugabyteExecutionPlan' = None
@@ -200,7 +325,7 @@ def call_upgrade_ysql(self):
self.logger.info("Calling upgrade_ysql and trying to upgrade metadata")
out = subprocess.check_output(
- ['bin/yb-admin', 'upgrade_ysql', '-master_addresses', f"{self.config.host}:7100"],
+ ['bin/yb-admin', 'upgrade_ysql', '-master_addresses', f"{self.config.yugabyte_master_addresses}:7100"],
stderr=subprocess.PIPE,
cwd=self.path, )
@@ -241,7 +366,7 @@ def change_version_and_compile(self, revision_or_path=None):
'--build-yugabyted-ui',
'--no-tests',
'--skip-java-build'],
- stdout=subprocess.DEVNULL,
+ stdout=subprocess.STDOUT,
stderr=subprocess.STDOUT,
cwd=self.path)
@@ -252,7 +377,7 @@ def destroy(self):
if self.config.allow_destroy_db:
self.logger.info("Destroying existing Yugabyte var/ directory")
- out = subprocess.check_output(['python3', 'bin/yugabyted', 'destroy'],
+ out = subprocess.check_output(['python3', 'bin/yb-ctl', 'destroy'],
stderr=subprocess.PIPE,
cwd=self.path, )
@@ -262,12 +387,12 @@ def destroy(self):
def start_database(self):
self.logger.info("Starting Yugabyte node")
- subprocess.call(['python3', 'bin/yugabyted', 'start'],
+ subprocess.call(['python3', 'bin/yb-ctl', 'start'],
# stdout=subprocess.DEVNULL,
stderr=subprocess.STDOUT,
cwd=self.path)
- out = subprocess.check_output(['python3', 'bin/yugabyted', 'status'],
+ out = subprocess.check_output(['python3', 'bin/yb-ctl', 'status'],
stderr=subprocess.PIPE,
cwd=self.path, )
@@ -283,7 +408,7 @@ def start_database(self):
def stop_database(self):
self.logger.info("Stopping Yugabyte node if exists")
- out = subprocess.check_output(['python3', 'bin/yugabyted', 'stop'],
+ out = subprocess.check_output(['python3', 'bin/yb-ctl', 'stop'],
stderr=subprocess.PIPE,
cwd=self.path, )
@@ -297,7 +422,7 @@ def stop_database(self):
shell=True)
-class YugabyteListOfQueries(ListOfQueries):
+class YugabyteCollectResult(CollectResult):
queries: List[YugabyteQuery] = None
@@ -305,4 +430,4 @@ class YugabyteResultsLoader(ResultsLoader):
def __init__(self):
super().__init__()
- self.clazz = YugabyteListOfQueries
+ self.clazz = YugabyteCollectResult
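
As a rough illustration of the `VERSION` scraping in `Yugabyte.get_revision_version` above: the pattern is applied to the text of the master's `/tablet-servers` page fetched with `requests.get`. The sample fragment, version, build, and revision values below are made up for illustration.

```python
import re

VERSION = r"version\s((\d+\.\d+\.\d+\.\d+)\s+build\s+(\d+)\s+revision\s+([0-9a-z]+))"

# Made-up fragment of the page served at http://<master-host>:7000/tablet-servers.
sample = "yb-tserver version 2.19.2.0 build 121 revision d142556567"

model_result = re.findall(VERSION, sample, re.MULTILINE)
if model_result:
    version = f"{model_result[0][1]}-b{model_result[0][2]}"  # '2.19.2.0-b121'
    revision = model_result[0][3]                            # 'd142556567'
    print(revision, version)
```
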
diff --git a/src/models/sql.py b/src/models/sql.py
index f9ca9284..e159466c 100644
--- a/src/models/sql.py
+++ b/src/models/sql.py
@@ -13,181 +13,266 @@
from objects import QueryTips, Field
from db.postgres import PostgresQuery, Table
from models.abstract import QTFModel
-from utils import get_alias_table_names, evaluate_sql, get_md5
+from utils import get_alias_table_names, evaluate_sql, get_md5, get_model_path, find_order_by_in_query
class SQLModel(QTFModel):
def create_tables(self, conn, skip_analyze=False, db_prefix=None):
- teardown_queries = []
- create_queries = []
- analyze_queries = []
- import_queries = []
- created_tables = []
-
- if DDLStep.DROP in self.config.ddls:
- _, teardown_queries = self.evaluate_ddl_queries(conn, DDLStep.DROP, db_prefix)
- teardown_queries.insert(0, "-- DROP QUERIES")
-
- if DDLStep.CREATE in self.config.ddls:
- created_tables, create_queries = self.evaluate_ddl_queries(conn, DDLStep.CREATE,
- db_prefix)
- create_queries.insert(0, "-- CREATE QUERIES")
-
- if DDLStep.IMPORT in self.config.ddls:
- _, import_queries = self.evaluate_ddl_queries(conn, DDLStep.IMPORT, db_prefix)
- import_queries.insert(0, "-- IMPORT QUERIES")
-
- if DDLStep.ANALYZE in self.config.ddls:
- analyzed_tables, analyze_queries = self.evaluate_ddl_queries(conn, DDLStep.ANALYZE,
- db_prefix)
- create_queries.insert(0, "-- ANALYZE QUERIES")
+ _, _, teardown_queries = self.evaluate_ddl_queries(conn, DDLStep.DROP, DDLStep.DROP in self.config.ddls,
+ db_prefix)
+ teardown_queries.insert(0, "-- DROP QUERIES")
+
+ created_tables, non_catalog_tables, create_queries = self.evaluate_ddl_queries(conn,
+ DDLStep.CREATE,
+ DDLStep.CREATE in self.config.ddls,
+ db_prefix)
+ create_queries.insert(0, "-- CREATE QUERIES")
+
+ _, _, import_queries = self.evaluate_ddl_queries(conn,
+ DDLStep.IMPORT,
+ DDLStep.IMPORT in self.config.ddls,
+ db_prefix)
+ import_queries.insert(0, "-- IMPORT QUERIES")
+
+ _, _, analyze_queries = self.evaluate_ddl_queries(conn,
+ DDLStep.ANALYZE,
+ DDLStep.ANALYZE in self.config.ddls,
+ db_prefix)
+ analyze_queries.insert(0, "-- ANALYZE QUERIES")
if not created_tables:
# try to load current tables
with conn.cursor() as cur:
- created_tables = self.load_tables_from_public(cur)
+ created_tables, non_catalog_tables = self.load_tables_from_public(cur)
+
+ self.load_table_stats(conn.cursor(), created_tables)
+
+ return created_tables, non_catalog_tables, teardown_queries, create_queries, analyze_queries, import_queries
- return created_tables, teardown_queries + create_queries + analyze_queries + import_queries
+ def get_valid_file_path(self, file_name: str, ddl_prefix: str):
+ complete_file_name = \
+ f"{ddl_prefix}.{file_name}" \
+ if (ddl_prefix and
+ exists(f"{get_model_path(self.config.model)}/{ddl_prefix}.{file_name}.sql")) \
+ else file_name
- def evaluate_ddl_queries(self, conn,
- step_prefix: DDLStep,
- db_prefix=None):
+ return f"{get_model_path(self.config.model)}/{complete_file_name}.sql"
+
+ def evaluate_ddl_queries(self, conn, step_prefix: DDLStep, do_execute: bool, db_prefix: str = None, ):
self.logger.info(f"Evaluating DDL {step_prefix.name} step")
created_tables: List[Table] = []
- file_name = step_prefix.name.lower()
-
+ non_catalog_tables: List[Table] = []
db_prefix = self.config.ddl_prefix or db_prefix
- if db_prefix and exists(f"sql/{self.config.model}/{db_prefix}.{file_name}.sql"):
- file_name = f"{db_prefix}.{file_name}"
+ path_to_file = self.get_valid_file_path(step_prefix.name.lower(), db_prefix)
model_queries = []
+
try:
with conn.cursor() as cur:
- evaluate_sql(cur, f"SET statement_timeout = '{self.config.ddl_query_timeout}s'")
-
- path_to_file = f"sql/{self.config.model}/{file_name}.sql"
+ if do_execute:
+ evaluate_sql(cur, f"SET statement_timeout = '{self.config.ddl_query_timeout}s'")
if not exists(path_to_file):
self.logger.warn(f"Unable to locate file {path_to_file}")
else:
- with open(f"sql/{self.config.model}/{file_name}.sql", "r") as sql_file:
+ with open(path_to_file, "r") as sql_file:
full_queries = self.apply_variables('\n'.join(sql_file.readlines()))
- for query in tqdm(full_queries.split(";")):
- try:
- if cleaned := query.lstrip():
- model_queries.append(cleaned)
- if step_prefix == DDLStep.IMPORT:
- self.import_from_local(cur, cleaned)
- else:
- evaluate_sql(cur, cleaned)
- except psycopg2.Error as e:
- self.logger.exception(e)
- raise e
- if step_prefix == DDLStep.CREATE:
- created_tables = self.load_tables_from_public(cur)
-
- return created_tables, model_queries
+ for query in tqdm(sqlparse.split(full_queries)):
+ # Nasty fix for RPC timeout
+ executed = False
+ while not executed:
+ try:
+ model_queries = self.process_query(cur, query, model_queries, do_execute=do_execute)
+ executed = True
+ except Exception as e:
+ if "RPC" not in str(e):
+ raise e
+
+
+ if step_prefix == DDLStep.CREATE and do_execute:
+ created_tables, non_catalog_tables = self.load_tables_from_public(cur)
+
+ return created_tables, non_catalog_tables, model_queries
except Exception as e:
self.logger.exception(e)
raise e
- def import_from_local(self, cur, cleaned):
+ def process_query(self, cur,
+ query: str,
+ model_queries: List[str],
+ do_execute: bool):
+ query = query.lstrip()
+ if not query:
+ return model_queries
+
+ model_queries.append(query)
+ if do_execute:
+ if not self.try_to_handle_copy(cur, query):
+ evaluate_sql(cur, query)
+
+ return model_queries
+
+ @staticmethod
+ def parse_with_param(param_name, params_str, value):
+ pattern = rf"(?i){param_name}\s{value}"
+ matches = re.findall(pattern, params_str, re.IGNORECASE)
+ return matches[0] if matches else None
+
+ def try_to_handle_copy(self, cur, query: str):
copy_re = r"(?i)\bCOPY\b\s(.+)\s\bFROM\b\s\'(.*)\'\s\bWITH\b\s\((.*\,?)\)"
- parse_re = re.findall(copy_re, cleaned, re.MULTILINE)[0]
- table_name = parse_re[0]
- local_path = parse_re[1]
- params = parse_re[2]
-
- delimiter = ","
- file_format = None
- null_format = ''
- if 'delimiter' in params.lower():
- delimiter = re.findall(r"(?i)delimiter\s\'(.{1,3})\'", params)[0]
- if delimiter == "\\t":
- delimiter = "\t"
- if 'format' in params.lower():
- file_format = re.findall(r"(?i)format\s([a-zA-Z]+)", params)[0]
- if 'null' in params.lower():
- null_format = re.findall(r"(?i)null\s\'([a-zA-Z]+)\'", params)[0]
-
- if 'csv' not in file_format.lower():
- raise AttributeError("Can't import from non CSV files")
+ parse_command = re.findall(copy_re, query, re.MULTILINE)
+
+ if not parse_command:
+ return False
+
+ table_name, local_path, params_str = parse_command[0]
+ delimiter = self.parse_with_param('delimiter', params_str, "\'(.{1,3})\'") or ","
+ file_format = self.parse_with_param('format', params_str, '[a-zA-Z]+')
+ null_format = self.parse_with_param('null', params_str, '\'([a-zA-Z]+)\'') or ''
+
+ if delimiter == "\\t":
+ delimiter = "\t"
+
+ if file_format is None or 'csv' not in file_format.lower():
+ raise AttributeError("Can only import from CSV files")
+
+ if not os.path.isfile(local_path):
+ raise FileNotFoundError(f"The file {local_path} does not exist")
with open(local_path, "r") as csv_file:
- cur.copy_from(csv_file, table_name,
- sep=delimiter,
- null=null_format)
+ cur.copy_from(csv_file, table_name, sep=delimiter, null=null_format)
- def load_tables_from_public(self, cur):
- created_tables = []
+ return True
- self.logger.info("Loading tables...")
- cur.execute(
- """
- select table_name, table_schema
- from information_schema.tables
- where table_schema = 'public' or table_schema = 'pg_catalog';
+ def load_tables_from_public(self, cur):
+ catalog_schema = ", 'pg_catalog'" if self.config.load_catalog_tables else ""
+
+ # we are assuming no table name conflicts between public and pg_catalog schemas for now.
+        # column_width (defined width) is -1 for types with unbounded length (text, arrays, etc.)
+ self.logger.info("Loading tables, columns and indexes...")
+ evaluate_sql(
+ cur,
+ f"""
+ select
+ relname as table_name,
+ attname as column_name,
+ attnum as column_position,
+ case when attlen > 0 then attlen else atttypmod end column_width,
+ coalesce(index_names, '{{}}') as index_names,
+ nspname as namespace_name
+ from
+ pg_namespace nc
+ join pg_class c on nc.oid = relnamespace
+ join pg_attribute a on attrelid = c.oid
+ left join (
+ select
+ array_agg(relname) as index_names,
+ indrelid,
+ keycol
+ from (
+ select relname, indrelid, unnest(indkey) keycol
+ from pg_index ix join pg_class ci on ix.indexrelid = ci.oid
+ ) indexes
+ group by
+ indrelid,
+ keycol
+ ) i on i.indrelid = c.oid
+ and i.keycol = a.attnum
+ where
+ relkind = 'r'
+ and attnum >= 0
+ and nspname in ('public'{catalog_schema})
+ order by
+ nspname,
+ relname,
+ attnum;
""")
- tables = []
- result = list(cur.fetchall())
- tables.extend((row[0], row[1])
- for row in result
- if row[1] not in ["information_schema"])
-
- self.logger.info("Loading columns and constraints...")
- for table_name, schema_name in tables:
- evaluate_sql(
- cur,
- f"""
- select column_name
- from information_schema.columns
- where table_schema = '{schema_name}'
- and table_name = '{table_name}';
- """
- )
-
- columns = [row[0] for row in list(cur.fetchall())]
-
- evaluate_sql(
- cur,
- f"""
- select
- t.relname as table_name,
- i.relname as index_name,
- a.attname as column_name
- from
- pg_class t,
- pg_class i,
- pg_index ix,
- pg_attribute a
- where
- t.oid = ix.indrelid
- and i.oid = ix.indexrelid
- and a.attrelid = t.oid
- and a.attnum = ANY(ix.indkey)
- and t.relkind = 'r'
- and t.relname like '{table_name}'
- order by
- t.relname,
- i.relname;
- """
- )
-
- fields = []
-
- result = list(cur.fetchall())
- try:
- for column in columns:
- is_indexed = any(column == row[2] for row in result)
- fields.append(Field(column, is_indexed))
- except Exception as e:
- self.logger.exception(result, e)
-
- created_tables.append(Table(name=table_name, fields=fields, size=0))
-
- return created_tables
+
+ created_tables = []
+ non_catalog_tables = []
+ table = Table()
+ for tname, cname, cpos, clen, inames, nname in cur.fetchall():
+ if tname != table.name:
+ table = Table(name=tname, fields=[], rows=0, size=0)
+ created_tables.append(table)
+ if nname != 'pg_catalog':
+ non_catalog_tables.append(table)
+
+ table.fields.append(Field(name=cname, position=cpos,
+ is_index=bool(inames),
+ indexes=inames,
+ defined_width=clen))
+
+ return created_tables, non_catalog_tables
+
+ def load_table_stats(self, cur, tables):
+ catalog_schema = ", 'pg_catalog'" if self.config.load_catalog_tables else ""
+
+ self.logger.info("Loading table statistics...")
+ tmap = {}
+ for t in tables:
+ if t.name in tmap:
+ raise AssertionError(f"Found multiple tables with the same name: {t.name}")
+ tmap[t.name] = t
+
+ evaluate_sql(
+ cur,
+ f"""
+ select
+ c.relname table_name,
+ c.reltuples as rows
+ from
+ pg_class c,
+ pg_namespace ns
+ where
+ ns.oid = c.relnamespace
+ and c.relkind = 'r'
+ and ns.nspname in ('public'{catalog_schema})
+ and c.relname =any(array{list(tmap)});
+ """
+ )
+
+ for tname, rows in cur.fetchall():
+ tmap[tname].rows = rows
+
+ self.logger.info("Loading column statistics...")
+ evaluate_sql(
+ cur,
+ f"""
+ select
+ c.relname as table_name,
+ a.attname as column_name,
+ a.attnum as column_position,
+ s.stawidth as avg_width
+ from
+ pg_namespace ns
+ join pg_class c on ns.oid = c.relnamespace
+ join pg_attribute a on a.attrelid = c.oid
+ left join pg_statistic s on s.starelid = c.oid
+ and a.attnum = s.staattnum
+ where
+ c.relkind = 'r'
+ and a.attnum > 0
+ and ns.nspname in ('public'{catalog_schema})
+ and c.relname =any(array{list(tmap)});
+ """
+ )
+
+ for tname, cname, cpos, cwidth in cur.fetchall():
+ if cwidth:
+ field = tmap[tname].fields[cpos - 1]
+ if field.name != cname or field.position != cpos:
+ raise AssertionError(''.join([
+ f"Field position mismatch in table {tname}:",
+ f" the fields[{cpos - 1}] should be {cname}",
+                        f" but found {field.name} at position {field.position}"]))
+ field.avg_width = cwidth
+
+ self.logger.debug("Loaded table and column metadata:")
+ for t in tables:
+ self.logger.debug(f"{t}")
@staticmethod
def get_comments(full_query):
@@ -213,25 +298,36 @@ def get_query_hint_tips(self, full_query):
comment_line.replace("-- tags: ", "").split(",")]
if comment_line.startswith("-- max_timeout: "):
tips.max_timeout = comment_line.replace("-- max_timeout: ", "").strip()
+ if comment_line.startswith("-- debug_hints: "):
+ tips.debug_hints = comment_line.replace("-- debug_hints: ", "").strip()
return tips
def get_queries(self, tables):
queries = []
- query_file_lists = sorted(list(glob.glob(f"sql/{self.config.model}/queries/*.sql")))
+ query_file_lists = sorted(list(glob.glob(f"{get_model_path(self.config.model)}/queries/*.sql")))
for query in query_file_lists:
with open(query, "r") as query_file:
full_queries = self.apply_variables(''.join(query_file.readlines()))
query_tips = self.get_query_hint_tips(full_queries)
+ query_debug_queries = []
for file_query in full_queries.split(";"):
if cleaned := sqlparse.format(file_query.lstrip(), strip_comments=True).strip():
- tables_in_query = get_alias_table_names(cleaned, tables)
- queries.append(PostgresQuery(
- tag=os.path.basename(query).replace(".sql", ""),
- query=cleaned,
- query_hash=get_md5(cleaned),
- tables=tables_in_query,
- optimizer_tips=query_tips))
+ if cleaned.lower().startswith("set "):
+ query_debug_queries.append(cleaned)
+ else:
+ current_tips = query_tips.copy()
+ current_debug_queries = query_debug_queries.copy()
+ current_tips.debug_queries = current_debug_queries
+
+ tables_in_query = get_alias_table_names(cleaned, tables)
+ queries.append(PostgresQuery(
+ tag=os.path.basename(query).replace(".sql", ""),
+ query=cleaned,
+ query_hash=get_md5(cleaned),
+ has_order_by=find_order_by_in_query(cleaned),
+ tables=tables_in_query,
+ optimizer_tips=current_tips))
if self.config.num_queries > 0:
queries = queries[:int(self.config.num_queries)]
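
A small sketch of the COPY parsing performed by `try_to_handle_copy`/`parse_with_param` above, showing how the table name, source file, and WITH options are extracted. The statement, table, and file path are made up for illustration, and `parse_with_param` is inlined as a plain function here.

```python
import re

def parse_with_param(param_name, params_str, value):
    # Mirrors SQLModel.parse_with_param: pull "<param> <value>" out of the WITH (...) options.
    pattern = rf"(?i){param_name}\s{value}"
    matches = re.findall(pattern, params_str, re.IGNORECASE)
    return matches[0] if matches else None

copy_re = r"(?i)\bCOPY\b\s(.+)\s\bFROM\b\s\'(.*)\'\s\bWITH\b\s\((.*\,?)\)"
query = "COPY lineitem FROM '/tmp/lineitem.csv' WITH (FORMAT csv, DELIMITER '|', NULL 'NULL')"

table_name, local_path, params_str = re.findall(copy_re, query, re.MULTILINE)[0]
delimiter = parse_with_param('delimiter', params_str, "\'(.{1,3})\'") or ","
file_format = parse_with_param('format', params_str, '[a-zA-Z]+')   # whole match, e.g. "FORMAT csv" (no capture group)
null_format = parse_with_param('null', params_str, '\'([a-zA-Z]+)\'') or ''

print(table_name, local_path, delimiter, null_format, file_format)
# -> lineitem /tmp/lineitem.csv | NULL FORMAT csv
```
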
diff --git a/src/objects.py b/src/objects.py
index 2c9c6a62..cd54077c 100644
--- a/src/objects.py
+++ b/src/objects.py
@@ -1,18 +1,49 @@
import dataclasses
-import json
-import os
-from typing import List, Dict, Type
+from enum import Enum
+import re
-from dacite import Config as DaciteConfig
-from dacite import from_dict
+from collections.abc import Iterable, Mapping
+from typing import List, Dict, Type
from config import Config
+from db.abstract import PlanNodeAccessor
+
+EXPLAIN = "EXPLAIN"
+
+
+class ExplainFlags(Enum):
+ ANALYZE = "ANALYZE"
+ DIST = "DIST"
+ VERBOSE = "VERBOSE"
+ TIMING = "TIMING"
+
+ COSTS_OFF = "COSTS OFF"
+
+
+@dataclasses.dataclass
+class FieldInTableHelper:
+ table_name: str
+ field_name: str
+
+ def copy(self):
+ return FieldInTableHelper(self.table_name, self.field_name)
+
+ def __hash__(self):
+ return hash(f"{self.table_name}.{self.field_name}")
@dataclasses.dataclass
class Field:
name: str = None
+ position: int = None
is_index: bool = None
+ indexes: List[str] = None
+ defined_width: int = None
+ avg_width: int = None
+
+ def copy(self):
+ return Field(self.name, self.position, self.is_index, self.indexes.copy(),
+ self.defined_width, self.avg_width)
@dataclasses.dataclass
@@ -20,8 +51,31 @@ class Table:
alias: str = None
name: str = None
fields: List[Field] = None
+ rows: int = 0
size: int = 0
+ def copy(self):
+ fields = [field.copy() for field in self.fields]
+ return Table(self.alias, self.name, fields, self.rows, self.size)
+
+ def __hash__(self):
+ return hash(f"{self.alias}.{self.name}")
+
+
+@dataclasses.dataclass
+class ExecutionPlan:
+ full_str: str = ""
+
+ def get_estimated_cost(self):
+ return -1
+
+ def is_present(self):
+ return self.full_str != "" and self.full_str is not None
+
+ def get_clean_plan(self, execution_plan=None):
+ # todo get plan tree instead here to support plan comparison between DBs
+ pass
+
@dataclasses.dataclass
class QueryTips:
@@ -29,6 +83,38 @@ class QueryTips:
reject: List[str] = dataclasses.field(default_factory=list)
tags: List[str] = dataclasses.field(default_factory=list)
max_timeout: str = dataclasses.field(default_factory=str)
+ debug_hints: str = dataclasses.field(default_factory=str)
+ debug_queries: List[str] = dataclasses.field(default_factory=list)
+
+ def copy(self):
+ return QueryTips(self.accept.copy(),
+ self.reject.copy(),
+ self.tags.copy(),
+ self.max_timeout,
+ self.debug_hints,
+ self.debug_queries.copy())
+
+
+@dataclasses.dataclass
+class QueryStats:
+ calls: int
+ total_time: float
+ min_time: float
+ max_time: float
+ mean_time: float
+ rows: int
+ latency: str
+
+ def __str__(self):
+ return (
+ f"Calls: {self.calls}\n"
+ f"Total time: {self.total_time}\n"
+ f"Min time: {self.min_time}\n"
+ f"Max time: {self.max_time}\n"
+ f"Mean time: {self.mean_time}\n"
+ f"Rows: {self.rows}\n"
+ f"Latency JSON: {self.latency}"
+ )
@dataclasses.dataclass
@@ -38,13 +124,18 @@ class Query:
query_hash: str = ""
tables: List[Table] = None
- optimizer_tips: QueryTips = None
+ optimizer_tips: QueryTips = dataclasses.field(default_factory=QueryTips)
explain_hints: str = ""
- execution_plan: 'ExecutionPlan' = None
+ # internal field to detect duplicates
+ cost_off_explain: 'ExecutionPlan' = dataclasses.field(default_factory=ExecutionPlan)
+
+ execution_plan: 'ExecutionPlan' = dataclasses.field(default_factory=ExecutionPlan)
execution_time_ms: float = 0
result_cardinality: int = 0
result_hash: str = None
+ has_order_by: bool = False
+ query_stats: QueryStats = None
parameters: List = None
@@ -52,17 +143,23 @@ class Query:
execution_plan_heatmap: Dict[int, Dict[str, str]] = None
+ def create_copy(self):
+ return Query(self.tag, self.query, self.query_hash, self.tables,
+ execution_time_ms=-1,
+ has_order_by=self.has_order_by,
+ execution_plan=ExecutionPlan("NOT FOUND"),
+ optimizations=[])
+
def get_query(self):
return self.query
- def get_explain(self):
- return f"{Config.explain_clause} {self.query}"
+ def get_explain(self, explain_clause: str = None, options: List[ExplainFlags] = None):
+ if not explain_clause:
+ explain_clause = Config().explain_clause
- def get_heuristic_explain(self):
- return f"EXPLAIN {self.query}"
+ options_clause = f" ({', '.join([opt.value for opt in options])})" if options else ""
- def get_explain_analyze(self):
- return f"EXPLAIN ANALYZE {self.query}"
+ return f"{explain_clause}{options_clause} {self.query}"
def compare_plans(self, execution_plan: Type['ExecutionPlan']):
pass
@@ -71,53 +168,197 @@ def heatmap(self):
pass
def get_best_optimization(self, config):
+ return self
+
+ def get_reportable_query(self):
+ return self.query.replace("|", "\|")
+
+ def get_inconsistent_results(self):
pass
+ def __eq__(self, other):
+ return self.query_hash == other.query_hash
+
+ def __hash__(self):
+ return hash(self.query_hash)
+
@dataclasses.dataclass
class Optimization(Query):
pass
-@dataclasses.dataclass
-class ListOfQueries:
- db_version: str = ""
- git_message: str = ""
- model_queries: List[str] = None
- queries: List[Type[Query]] = None
-
- def append(self, new_element):
- if not self.queries:
- self.queries = [new_element, ]
- else:
- self.queries.append(new_element)
-
- # CPUs are cheap in 2022
- self.queries.sort(key=lambda q: q.query_hash)
-
-
-class EPNode:
- def __init__(self):
- self.root: 'EPNode' | None = None
- self.childs: List['EPNode'] = []
- self.type: str = ""
- self.full_str: str = ""
+class PlanNode:
+ def __init__(self, accessor: PlanNodeAccessor, node_type, node_name):
+ self.acc: PlanNodeAccessor = accessor
+ self.node_type: str = node_type
self.level: int = 0
+ self.name: str = node_name
+        self.properties: Mapping[str, str] = dict()
+ self.child_nodes: Iterable[PlanNode] = list()
+
+ self.startup_cost: float = 0.0
+ self.total_cost: float = 0.0
+ self.plan_rows: float = 0.0
+ self.plan_width: int = 0
+
+ self.startup_ms: float = 0.0
+ self.total_ms: float = 0.0
+ self.rows: float = 0.0
+ self.nloops: float = 0.0
+
+ def __cmp__(self, other):
+ pass # todo
def __str__(self):
- return self.full_str
+ return self.get_full_str(estimate=True, actual=True)
+
+ def get_full_str(self, estimate=True, actual=True, properties=False, level=False):
+ return ''.join([
+ f'{self.level}: ' if level else '',
+ self.name,
+ f' {self.get_estimate_str()}' if estimate else '',
+ f' {self.get_actual_str()}' if actual else '',
+ str(self.properties) if properties and len(self.properties) > 0 else '',
+ ])
+
+ def get_estimate_str(self):
+ return (f'(cost={self.startup_cost}..{self.total_cost} rows={self.plan_rows}'
+ f' width={self.plan_width})')
+
+ def get_actual_str(self):
+ return ((f'(actual time={self.startup_ms}..{self.total_ms} rows={self.rows}'
+ f' loops={self.nloops})') if self.nloops else ' (never executed)')
+
+ def has_valid_cost(self):
+ return self.acc.has_valid_cost(self)
+
+ # return False on success
+ def fixup_invalid_cost(self):
+ return self.acc.fixup_invalid_cost(self)
+
+ def get_property(self, key, with_label=False):
+ value = self.properties.get(key, '')
+ return (f'{key}: {value}' if with_label else value) if value else ''
+
+ def get_actual_row_adjusted_cost(self):
+ return ((float(self.total_cost) - float(self.startup_cost))
+ * float(self.rows) / float(self.plan_rows)
+ + float(self.startup_cost))
+
+
+class ScanNode(PlanNode):
+ def __init__(self, accessor, node_type, node_name, table_name, table_alias, index_name,
+ is_backward, is_distinct, is_parallel):
+ super().__init__(accessor, node_type, node_name)
+ self.table_name: str = table_name
+ self.table_alias: str = table_alias
+ self.index_name: str = index_name
+ self.is_backward: bool = is_backward
+ self.is_distinct: bool = is_distinct
+ self.is_parallel: bool = is_parallel
+
+ self.is_seq_scan = self.acc.is_seq_scan(self)
+ self.is_index_scan = self.acc.is_index_scan(self)
+ self.is_index_only_scan = self.acc.is_index_only_scan(self)
+ self.is_any_index_scan = self.is_index_scan or self.is_index_only_scan
+ def __str__(self):
+ return ' '.join(filter(lambda s: s,
+ [self.get_full_str(),
+ self.get_search_condition_str(with_label=True),
+ ('Partial Aggregate'
+ if self.is_scan_with_partial_aggregate() else '')]))
-@dataclasses.dataclass
-class ExecutionPlan:
- full_str: str
+ def get_search_condition_str(self, with_label=False):
+ return (' ' if with_label else ' AND ').join(
+ filter(lambda cond: cond,
+ [self.get_index_cond(with_label),
+ self.get_remote_filter(with_label),
+ self.get_remote_tfbr_filter(with_label),
+ self.get_local_filter(with_label),
+ ]))
- def get_estimated_cost(self):
- pass
+ def get_index_cond(self, with_label=False):
+ return self.acc.get_index_cond(self, with_label)
- def get_clean_plan(self, execution_plan=None):
- # todo get plan tree instead here to support plan comparison between DBs
- pass
+ def may_have_table_fetch_by_rowid(self):
+ return self.acc.may_have_table_fetch_by_rowid(self)
+
+ def get_remote_filter(self, with_label=False):
+ return self.acc.get_remote_filter(self, with_label)
+
+ # TFBR: Table Fetch By Rowid
+ def get_remote_tfbr_filter(self, with_label=False):
+ return self.acc.get_remote_tfbr_filter(self, with_label)
+
+ def get_local_filter(self, with_label=False):
+ return self.acc.get_local_filter(self, with_label)
+
+ def get_rows_removed_by_recheck(self, with_label=False):
+ return self.acc.get_rows_removed_by_recheck(self, with_label)
+
+ def has_no_filter(self):
+ return (not self.get_remote_filter()
+ and not self.get_remote_tfbr_filter()
+ and not self.get_local_filter()
+ and not self.get_rows_removed_by_recheck())
+
+ def is_scan_with_partial_aggregate(self):
+ return self.acc.is_scan_with_partial_aggregate(self)
+
+
+class JoinNode(PlanNode):
+ pass
+
+
+class AggregateNode(PlanNode):
+ pass
+
+
+class SortNode(PlanNode):
+ pass
+
+
+class PlanNodeVisitor:
+ pat = re.compile(r'([A-Z][a-z0-9]*)([A-Z])')
+
+ def visit(self, node):
+ snake_cased_class_name = self.pat.sub(r'\1_\2', node.__class__.__name__).lower()
+ method = f'visit_{snake_cased_class_name}'
+ visitor = getattr(self, method, self.generic_visit)
+ return visitor(node)
+
+ def generic_visit(self, node):
+ for child in node.child_nodes:
+ self.visit(child)
+
+
+class PlanPrinter(PlanNodeVisitor):
+ def __init__(self, estimate=True, actual=True, properties=False, level=False):
+ super().__init__()
+ self.plan_tree_str: str = ""
+ self.estimate = estimate
+ self.actual = actual
+ self.properties = properties
+ self.level = level
+
+ def generic_visit(self, node):
+ self.plan_tree_str += f"{'':>{node.level * 2}s}-> " if node.level else ''
+ self.plan_tree_str += node.get_full_str(self.estimate, self.actual,
+ properties=False, level=self.level)
+ if self.properties:
+ self.plan_tree_str += ''.join([
+ f"\n{'':>{node.level * 2}s} {key}: {value}"
+ for key, value in node.properties.items()])
+ self.plan_tree_str += '\n'
+ super().generic_visit(node)
+
+ @staticmethod
+ def build_plan_tree_str(node, estimate=True, actual=True, properties=False, level=False):
+ printer = PlanPrinter(estimate, actual, properties, level)
+ printer.visit(node)
+ return printer.plan_tree_str
@dataclasses.dataclass
@@ -145,27 +386,3 @@ def filter_optimization_tips(self, explain_hints):
break
return skip_optimization
-
-
-class EnhancedJSONEncoder(json.JSONEncoder):
- def default(self, o):
- if dataclasses.is_dataclass(o):
- return dataclasses.asdict(o)
- return super().default(o)
-
-
-class ResultsLoader:
-
- def __init__(self):
- self.clazz = ListOfQueries
-
- def get_queries_from_previous_result(self, previous_execution_path):
- with open(previous_execution_path, "r") as prev_result:
- return from_dict(self.clazz, json.load(prev_result), DaciteConfig(check_types=False))
-
- def store_queries_to_file(self, queries: Type[ListOfQueries], output_json_name: str):
- if not os.path.isdir("report"):
- os.mkdir("report")
-
- with open(f"report/{output_json_name}.json", "w") as result_file:
- result_file.write(json.dumps(queries, cls=EnhancedJSONEncoder))
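
To show how the new plan-node classes and `PlanPrinter` above fit together, here is a tiny hand-built tree rendered with `build_plan_tree_str`. This is a sketch only: it assumes `src/` is on the import path and passes `None` for the accessor (printing does not use it); in the framework the tree comes from `PostgresExecutionPlan.parse_plan`.

```python
from objects import PlanNode, PlanPrinter  # assumes src/ is on PYTHONPATH

# Hand-built two-node tree with made-up numbers.
join = PlanNode(None, 'Nested Loop', 'Nested Loop')
join.startup_cost, join.total_cost, join.plan_rows, join.plan_width = 0.0, 15.3, 10, 16
join.startup_ms, join.total_ms, join.rows, join.nloops = 0.1, 1.2, 10, 1

scan = PlanNode(None, 'Seq Scan', 'Seq Scan on t1')
scan.level = 1
scan.startup_cost, scan.total_cost, scan.plan_rows, scan.plan_width = 0.0, 5.0, 100, 8
scan.startup_ms, scan.total_ms, scan.rows, scan.nloops = 0.0, 0.4, 100, 1

join.child_nodes.append(scan)

print(PlanPrinter.build_plan_tree_str(join, estimate=True, actual=True, level=True))
# 0: Nested Loop (cost=0.0..15.3 rows=10 width=16) (actual time=0.1..1.2 rows=10 loops=1)
#   -> 1: Seq Scan on t1 (cost=0.0..5.0 rows=100 width=8) (actual time=0.0..0.4 rows=100 loops=1)
```
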
diff --git a/src/reports/abstract.py b/src/reports/abstract.py
deleted file mode 100644
index 82b4e4cc..00000000
--- a/src/reports/abstract.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import difflib
-import os
-import shutil
-import subprocess
-import time
-from pathlib import Path
-
-from config import Config
-
-
-class Report:
- def __init__(self):
- self.config = Config()
- self.logger = self.config.logger
-
- self.report = f"= Optimizer {self.get_report_name()} Test Report \n" \
- f":source-highlighter: coderay\n" \
- f":coderay-linenums-mode: inline\n\n"
-
- self._start_collapsible("Configuration")
- self._start_source()
- self.report += str(self.config)
- self._end_source()
- self._end_collapsible()
-
- self.reported_queries_counter = 0
- self.queries = []
-
- self.start_date = time.strftime("%Y%m%d-%H%M%S")
-
- if self.config.clear:
- self.logger.info("Clearing report directory")
- shutil.rmtree("report", ignore_errors=True)
-
- if not os.path.isdir("report"):
- os.mkdir("report")
-
- if not os.path.isdir(f"report/{self.start_date}"):
- os.mkdir(f"report/{self.start_date}")
-
- def get_report_name(self):
- return ""
-
- def report_model(self, model_queries):
- if model_queries:
- self._start_collapsible("Model queries")
- self._start_source(["sql"])
- self.report += "\n".join(
- [query if query.endswith(";") else f"{query};" for query in model_queries])
- self._end_source()
- self._end_collapsible()
-
- def _add_double_newline(self):
- self.report += "\n\n"
-
- def _start_table(self, columns: str = "1"):
- self.report += f"[cols=\"{columns}\"]\n" \
- "|===\n"
-
- def _start_table_row(self):
- self.report += "a|"
-
- def _end_table_row(self):
- self.report += "\n"
-
- def _end_table(self):
- self.report += "|===\n"
-
- def _start_source(self, additional_tags=None):
- tags = f",{','.join(additional_tags)}" if additional_tags else ""
-
- self.report += f"[source{tags},linenums]\n----\n"
-
- def _end_source(self):
- self.report += "\n----\n"
-
- def _start_collapsible(self, name):
- self.report += f"""\n\n.{name}\n[%collapsible]\n====\n"""
-
- def _end_collapsible(self):
- self.report += """\n====\n\n"""
-
- @staticmethod
- def _get_plan_diff(original, changed):
- return "\n".join(
- text for text in difflib.unified_diff(original.split("\n"), changed.split("\n")) if
- text[:3] not in ('+++', '---', '@@ '))
-
- def publish_report(self, report_name):
- report_adoc = f"report/{self.start_date}/report_{report_name}_{self.config.output}.adoc"
-
- with open(report_adoc, "w") as file:
- file.write(self.report)
-
- self.logger.info(f"Generating report file from {report_adoc} and compiling html")
- subprocess.run(
- f'{self.config.asciidoctor_path} '
- f'-a stylesheet={os.path.abspath("css/adoc.css")} '
- f'{report_adoc}',
- shell=True)
-
- report_html_path = Path(f'report/{self.start_date}/report_{report_name}_{self.config.output}.html')
- self.logger.info(f"Done! Check report at {report_html_path.absolute()}")
diff --git a/src/reports/adoc/comparison.py b/src/reports/adoc/comparison.py
deleted file mode 100644
index 441be058..00000000
--- a/src/reports/adoc/comparison.py
+++ /dev/null
@@ -1,134 +0,0 @@
-from sql_formatter.core import format_sql
-
-from objects import ListOfQueries, Query
-from reports.abstract import Report
-
-
-class ComparisonReport(Report):
- def __init__(self):
- super().__init__()
-
- self.queries = {}
-
- @classmethod
- def generate_report(cls,
- loq_yb: ListOfQueries,
- loq_pg: ListOfQueries):
- report = ComparisonReport()
-
- report.define_version(loq_yb.db_version, loq_pg.db_version)
- report.report_model(loq_yb.model_queries)
-
- for query in zip(loq_yb.queries, loq_pg.queries):
- report.add_query(*query)
-
- report.build_report()
- report.publish_report("cmp")
-
- def get_report_name(self):
- return "Comparison"
-
- def define_version(self, first_version, second_version):
- self.report += f"[VERSION]\n====\nYugabyte:\n{first_version}\n\nPostgres:\n{second_version}\n====\n\n"
-
- def add_query(self, first_query: Query, second_query: Query):
- if first_query.tag not in self.queries:
- self.queries[first_query.tag] = [[first_query, second_query], ]
- else:
- self.queries[first_query.tag].append([first_query, second_query])
-
- def build_report(self):
- # link to top
- self.report += "\n[#top]\n== Summary\n"
-
- num_columns = 5
- self._start_table("1,1,1,1,4")
- self.report += "|Yugabyte|Postgres|Ratio vs Postgres|Ratio vs Postgres x3|Query\n"
- for tag, queries in self.queries.items():
- self.report += f"{num_columns}+m|{tag}.sql\n"
- for query in queries:
- ratio = "{:.2f}".format(query[0].execution_time_ms / query[1].execution_time_ms if query[1].execution_time_ms != 0 else 99999999)
- ratio_x3 = query[0].execution_time_ms / (3 * query[1].execution_time_ms) if query[1].execution_time_ms != 0 else 99999999
- ratio_x3_str = "{:.2f}".format(query[0].execution_time_ms / (3 * query[1].execution_time_ms) if query[1].execution_time_ms != 0 else 99999999)
- color = "[green]" if ratio_x3 <= 1.0 else "[red]"
- self.report += f"|{query[0].execution_time_ms}\n" \
- f"|{query[1].execution_time_ms}\n" \
- f"a|*{ratio}*\n" \
- f"a|{color}#*{ratio_x3_str}*#\n"
- self.report += f"a|[#{query[0].query_hash}_top]\n<<{query[0].query_hash}>>\n"
- self._start_source(["sql"])
- self.report += format_sql(query[1].query.replace("|", "\|"))
- self._end_source()
- self.report += "\n"
- self._end_table_row()
- self._end_table()
-
- # different results links
- for tag in self.queries.keys():
- self.report += f"\n<<{tag}>>\n"
-
- for tag, queries in self.queries.items():
- self.report += f"\n[#{tag}]\n== {tag} queries file\n\n"
- for query in queries:
- self.__report_query(query[0], query[1])
-
- # noinspection InsecureHash
- def __report_query(self, yb_query: Query, pg_query: Query):
- self.reported_queries_counter += 1
-
- self.report += f"\n[#{yb_query.query_hash}]\n"
- self.report += f"=== Query {yb_query.query_hash}"
- self.report += f"\n{yb_query.tag}\n"
- self.report += "\n<>\n"
- self.report += f"\n<<{yb_query.query_hash}_top,Show in summary>>\n"
- self._add_double_newline()
-
- self._start_source(["sql"])
- self.report += format_sql(yb_query.query.replace("|", "\|"))
- self._end_source()
-
- self._add_double_newline()
-
- self._start_table("3")
- self.report += "|Metric|Yugabyte|Postgres\n"
- self._start_table_row()
- self.report += f"Cardinality|{yb_query.result_cardinality}|{pg_query.result_cardinality}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Estimated cost|{yb_query.execution_plan.get_estimated_cost()}|{pg_query.execution_plan.get_estimated_cost()}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Execution time|{yb_query.execution_time_ms}|{pg_query.execution_time_ms}"
- self._end_table_row()
- self._end_table()
-
- self._start_table()
- self._start_table_row()
-
- self._start_collapsible("Yugabyte version plan")
- self._start_source(["diff"])
- self.report += yb_query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Postgres version plan")
- self._start_source(["diff"])
- self.report += pg_query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_source(["diff"])
-
- diff = self._get_plan_diff(yb_query.execution_plan.full_str, pg_query.execution_plan.full_str)
- if not diff:
- diff = yb_query.execution_plan.full_str
-
- self.report += diff
- self._end_source()
- self._end_table_row()
-
- self.report += "\n"
-
- self._end_table()
-
- self._add_double_newline()
diff --git a/src/reports/adoc/regression.py b/src/reports/adoc/regression.py
deleted file mode 100644
index 0273c375..00000000
--- a/src/reports/adoc/regression.py
+++ /dev/null
@@ -1,259 +0,0 @@
-from dataclasses import dataclass
-
-from sql_formatter.core import format_sql
-
-from objects import ListOfQueries, Query
-from reports.abstract import Report
-
-
-@dataclass
-class ShortSummaryReport:
- diff_plans: int = 0
- diff_rpc_calls: int = 0
- diff_wait_times: int = 0
- diff_scanned_rows: int = 0
- diff_peak_memory: int = 0
-
-
-class RegressionReport(Report):
- def __init__(self):
- super().__init__()
-
- self.v1_name = None
- self.v2_name = None
- self.queries = {}
- self.short_summary = ShortSummaryReport()
-
- @classmethod
- def generate_report(cls,
- v1_name: str,
- v2_name: str,
- loq_v1: ListOfQueries,
- loq_v2: ListOfQueries):
- report = RegressionReport()
-
- report.define_version_names(v1_name, v2_name)
- report.define_version(loq_v1.db_version, loq_v2.db_version)
- report.report_model(loq_v1.model_queries)
-
- for query in zip(loq_v1.queries, loq_v2.queries):
- if query[0].query_hash != query[1].query_hash:
- raise AttributeError("Query hashes are not matching, check input files")
-
- report.add_query(*query)
-
- report.build_report()
- report.publish_report("reg")
- report.publish_short_report()
-
- def get_report_name(self):
- return "Regression"
-
- def define_version(self, first_version, second_version):
- self.report += f"[GIT COMMIT/VERSION]\n====\n" \
- f"First:\n{first_version}\n\nSecond:\n{second_version}\n====\n\n"
-
- def add_query(self, first_query: Query, second_query: Query):
- if first_query.tag not in self.queries:
- self.queries[first_query.tag] = [[first_query, second_query], ]
- else:
- self.queries[first_query.tag].append([first_query, second_query])
-
- def build_report(self):
- # link to top
- self.add_plan_comparison()
- self.add_rpc_calls()
- self.add_rpc_wait_times()
- self.add_scanned_rows()
- self.add_peak_memory_collapsible()
-
- self.report += "\n[#query_summary]\n== Query Summary\n"
- num_columns = 4
- self._start_table("1,1,1,4")
- self.report += f"|{self.v1_name}|{self.v2_name}|Ratio (Second/First)|Query\n"
- for tag, queries in self.queries.items():
- self.report += f"{num_columns}+m|{tag}.sql\n"
- for query_id, query in enumerate(queries):
- same_plan = query[0].compare_plans(query[1].execution_plan)
- color = "[green]" if same_plan else "[orange]"
- ratio = "{:.2f}".format(
- query[1].execution_time_ms / query[0].execution_time_ms
- if query[0].execution_time_ms != 0 else 0)
-
- # insert anchor to the first query in file
- self.report += "a|"
- if query_id == 0:
- self.report += f"[#{tag}]\n"
-
- # append all query stats
- self.report += f"{query[0].execution_time_ms}\n" \
- f"|{query[1].execution_time_ms}\n" \
- f"a|{color}#*{ratio}*#\n"
- self.report += f"a|[#{query[0].query_hash}_query]\n" \
- f"<>\n\n" \
- f"<<{query[0].query_hash}>>\n"
- self._start_source(["sql"])
- self.report += format_sql(query[1].query.replace("|", "\|"))
- self._end_source()
- self.report += "\n"
- self._end_table_row()
- self._end_table()
-
- for tag, queries in self.queries.items():
- self.report += f"\n== {tag} queries file\n\n"
- for query in queries:
- self.__report_query(query[0], query[1])
-
- def add_plan_comparison(self):
- self._start_collapsible("Plan comparison")
- self.report += "\n[#plans_summary]\n"
- self._start_table("2")
- for tag, queries in self.queries.items():
- num_same_plans = sum(1 for query in queries
- if query[0].compare_plans(query[1].execution_plan))
- self.report += f"a|<<{tag}>>\n"
- self.short_summary.diff_plans = len(queries) - num_same_plans
- color = "[green]" if self.short_summary.diff_plans == 0 else "[orange]"
- self.report += f"a|{color}#*{self.short_summary.diff_plans}*#\n"
- self._end_table_row()
- self._end_table()
- self._end_collapsible()
-
- def add_rpc_calls(self):
- self._start_collapsible("RPC Calls")
- self.report += "\n[#rpc_summary]\n"
- self._start_table("2")
- for tag, queries in self.queries.items():
- self.short_summary.diff_rpc_calls = sum(
- query[0].execution_plan.get_rpc_calls() != query[1].execution_plan.get_rpc_calls()
- for query in queries
- )
- self.report += f"a|<<{tag}>>\n"
- color = "[green]" if self.short_summary.diff_rpc_calls == 0 else "[orange]"
- self.report += f"a|{color}#*{self.short_summary.diff_rpc_calls}*#\n"
- self._end_table_row()
- self._end_table()
- self._end_collapsible()
-
- def add_rpc_wait_times(self):
- self._start_collapsible("RPC Wait Times")
- self.report += "\n[#rpc_wait_summary]\n"
- self._start_table("2")
- for tag, queries in self.queries.items():
- self.short_summary.diff_wait_times = sum(
- query[0].execution_plan.get_rpc_wait_times() != query[1].execution_plan.get_rpc_wait_times()
- for query in queries
- )
- self.report += f"a|<<{tag}>>\n"
- color = "[green]" if self.short_summary.diff_wait_times == 0 else "[orange]"
- self.report += f"a|{color}#*{self.short_summary.diff_wait_times}*#\n"
- self._end_table_row()
- self._end_table()
- self._end_collapsible()
-
- def add_scanned_rows(self):
- self._start_collapsible("Scanned rows")
- self.report += "\n[#rows_summary]\n"
- self._start_table("2")
- for tag, queries in self.queries.items():
- num_same_plans = sum(
- query[0].execution_plan.get_scanned_rows() != query[1].execution_plan.get_scanned_rows()
- for query in queries
- )
- self.report += f"a|<<{tag}>>\n"
- color = "[green]" if num_same_plans == 0 else "[orange]"
- self.report += f"a|{color}#*{num_same_plans}*#\n"
- self._end_table_row()
- self._end_table()
- self._end_collapsible()
-
- def add_peak_memory_collapsible(self):
- self._start_collapsible("Peak memory")
- self.report += "\n[#memory_summary]\n"
- self._start_table("2")
- for tag, queries in self.queries.items():
- self.short_summary.diff_peak_memory = sum(
- query[0].execution_plan.get_peak_memory() != query[1].execution_plan.get_peak_memory()
- for query in queries
- )
- self.report += f"a|<<{tag}>>\n"
- color = "[green]" if self.short_summary.diff_peak_memory == 0 else "[orange]"
- self.report += f"a|{color}#*{self.short_summary.diff_peak_memory}*#\n"
- self._end_table_row()
- self._end_table()
- self._end_collapsible()
-
- # noinspection InsecureHash
- def __report_query(self, first_query: Query, second_query: Query):
- self.reported_queries_counter += 1
-
- self.report += f"\n[#{first_query.query_hash}]\n"
- self.report += f"=== Query {first_query.query_hash}"
- self.report += f"\nTags: `{first_query.tag}`\n"
- self.report += "\n<>\n"
- self.report += "\n<>\n"
- self.report += f"\n<<{first_query.query_hash}_query,Show in query summary>>\n"
- self._add_double_newline()
-
- self._start_source(["sql"])
- self.report += format_sql(first_query.query.replace("|", "\|"))
- self._end_source()
-
- self._add_double_newline()
-
- self._start_table("3")
- self.report += f"|Metric|{self.v1_name}|{self.v2_name}\n"
- self._start_table_row()
- self.report += f"Cardinality|{first_query.result_cardinality}|{second_query.result_cardinality}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Optimizer cost|{first_query.execution_plan.get_estimated_cost()}|{second_query.execution_plan.get_estimated_cost()}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Execution time|{first_query.execution_time_ms}|{second_query.execution_time_ms}"
- self._end_table_row()
- self._end_table()
-
- self._start_table()
- self._start_table_row()
-
- self._start_collapsible(f"{self.v1_name} version plan")
- self._start_source(["diff"])
- self.report += first_query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible(f"{self.v2_name} version plan")
- self._start_source(["diff"])
- self.report += second_query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_source(["diff"])
-
- diff = self._get_plan_diff(first_query.execution_plan.full_str, second_query.execution_plan.full_str)
- if not diff:
- diff = first_query.execution_plan.full_str
-
- self.report += diff
- self._end_source()
- self._end_table_row()
-
- self.report += "\n"
-
- self._end_table()
-
- self._add_double_newline()
-
- def define_version_names(self, v1_name, v2_name):
- self.v1_name = v1_name
- self.v2_name = v2_name
-
- def publish_short_report(self):
- with open(f"report/{self.start_date}/short_regression_summary.txt", "w") as short_summary:
- short_summary.write(f"Changed plans: {self.short_summary.diff_plans}\n")
- short_summary.write(f"Changed scanned rows: {self.short_summary.diff_scanned_rows}\n")
- short_summary.write(f"Changed RPC calls: {self.short_summary.diff_rpc_calls}\n")
- short_summary.write(f"Changed RPC wait times: {self.short_summary.diff_wait_times}\n")
- short_summary.write(f"Changed peak memory: {self.short_summary.diff_peak_memory}\n")
-
diff --git a/src/reports/adoc/score.py b/src/reports/adoc/score.py
deleted file mode 100644
index 15ad3299..00000000
--- a/src/reports/adoc/score.py
+++ /dev/null
@@ -1,469 +0,0 @@
-import os
-from math import log
-
-import numpy as np
-from typing import Type
-
-from matplotlib import pyplot as plt
-from sql_formatter.core import format_sql
-
-from objects import ListOfQueries, Query
-from reports.abstract import Report
-from utils import allowed_diff, disabled_path
-
-
-class ScoreReport(Report):
- def __init__(self):
- super().__init__()
-
- os.mkdir(f"report/{self.start_date}/imgs")
-
- self.queries = {}
- self.overall_plots = {
- 'color': 'k.',
- 'x_values': [],
- 'y_values': []
- }
-
- @classmethod
- def generate_report(cls, loq: ListOfQueries, pg_loq: ListOfQueries = None):
- report = ScoreReport()
-
- report.define_version(loq.db_version)
- report.report_model(loq.model_queries)
-
- for qid, query in enumerate(loq.queries):
- report.add_query(query, pg_loq.queries[qid] if pg_loq else None)
-
- report.build_report()
- report.publish_report("score")
-
- def get_report_name(self):
- return "score"
-
- def define_version(self, version):
- self.report += f"[VERSION]\n====\n{version}\n====\n\n"
-
- def calculate_score(self, query):
- if query.execution_time_ms == 0:
- return -1
- else:
- return "{:.2f}".format(
- query.get_best_optimization(
- self.config).execution_time_ms / query.execution_time_ms)
-
- def create_default_query_plot(self):
- x_data = []
- y_data = []
-
- for tag, queries in self.queries.items():
- for yb_pg_queries in queries:
- query = yb_pg_queries[0]
- if query.execution_time_ms:
- x_data.append(query.execution_plan.get_estimated_cost())
- y_data.append(query.execution_time_ms)
-
- fig = self.generate_regression_and_standard_errors(x_data, y_data)
-
- file_name = f'imgs/all_queries_defaults.png'
- fig.savefig(f"report/{self.start_date}/{file_name}", dpi=300)
- plt.close()
-
- return file_name
-
- def create_optimizations_plot(self):
- x_data = []
- y_data = []
-
- for tag, queries in self.queries.items():
- for yb_pg_queries in queries:
- query = yb_pg_queries[0]
- x_data += [q.execution_plan.get_estimated_cost() for q in query.optimizations
- if q.execution_time_ms != 0 and not disabled_path(q)]
- y_data += [q.execution_time_ms for q in query.optimizations
- if q.execution_time_ms != 0 and not disabled_path(q)]
-
- fig = self.generate_regression_and_standard_errors(x_data, y_data)
-
- file_name = f'imgs/all_optimizations.png'
- fig.savefig(f"report/{self.start_date}/{file_name}", dpi=300)
- plt.close()
-
- return file_name
-
- @staticmethod
- def generate_regression_and_standard_errors(x_data, y_data):
- x = np.array(x_data)
- y = np.array(y_data)
- n = x.size
-
- a, b = np.polyfit(x, y, deg=1)
- y_est = a * x + b
- y_err = (y - y_est).std() * np.sqrt(1 / n + (x - x.mean()) ** 2 / np.sum((x - x.mean()) ** 2))
-
- fig, ax = plt.subplots()
-
- plt.xlabel('Predicted cost')
- plt.ylabel('Execution time [ms]')
-
- ax.plot(x, y_est, '-')
- ax.fill_between(x, y_est - y_err, y_est + y_err, alpha=0.2)
- ax.plot(x, y, 'k.')
-
- return fig
-
- def create_query_plot(self, best_optimization, optimizations, query):
- if not optimizations:
- return "NO PLOT"
-
- plt.xlabel('Execution time [ms]')
- plt.ylabel('Predicted cost')
-
- plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms != 0],
- [q.execution_plan.get_estimated_cost() for q in optimizations if
- q.execution_time_ms != 0], 'k.',
- [query.execution_time_ms],
- [query.execution_plan.get_estimated_cost()], 'r^',
- [best_optimization.execution_time_ms],
- [best_optimization.execution_plan.get_estimated_cost()], 'go')
-
- file_name = f'imgs/query_{self.reported_queries_counter}.png'
- plt.savefig(f"report/{self.start_date}/{file_name}")
- plt.close()
-
- return file_name
-
- def add_query(self, query: Type[Query], pg: Type[Query] | None):
- if query.tag not in self.queries:
- self.queries[query.tag] = [[query, pg], ]
- else:
- self.queries[query.tag].append([query, pg])
-
- def build_report(self):
- self._start_table("2")
- self.report += "|Default query plans|Optimizations\n"
- self.report += f"a|image::{self.create_default_query_plot()}[Defaults,align=\"center\"]\n"
- self.report += f"a|image::{self.create_optimizations_plot()}[Optimizations,align=\"center\"]\n"
- self._end_table()
-
- self.report += "\n== QO score\n"
-
- yb_bests = 0
- pg_bests = 0
- qe_bests_geo = 1
- qo_yb_bests_geo = 1
- qo_pg_bests_geo = 1
- total = 0
- for queries in self.queries.values():
- for query in queries:
- yb_query = query[0]
- pg_query = query[1]
-
- yb_best = yb_query.get_best_optimization(self.config)
- pg_best = pg_query.get_best_optimization(self.config)
-
- pg_success = pg_query.execution_time_ms != 0
-
- qe_bests_geo *= yb_best.execution_time_ms / pg_best.execution_time_ms if pg_success else 1
- qo_yb_bests_geo *= (
- yb_query.execution_time_ms if yb_query.execution_time_ms > 0 else 1.0) / (
- yb_best.execution_time_ms if yb_best.execution_time_ms > 0 else 1)
- qo_pg_bests_geo *= pg_query.execution_time_ms / pg_best.execution_time_ms if pg_best.execution_time_ms != 0 else 9999999
- yb_bests += 1 if yb_query.compare_plans(yb_best.execution_plan) else 0
- pg_bests += 1 if pg_success and pg_query.compare_plans(
- pg_best.execution_plan) else 0
-
- total += 1
-
- self._start_table("4,1,1")
- self.report += "|Statistic|YB|PG\n"
- self.report += f"|Best execution plan picked|{'{:.2f}'.format(float(yb_bests) * 100 / total)}%|{'{:.2f}'.format(float(pg_bests) * 100 / total)}%\n"
- self.report += f"|Geomeric mean QE best\n2+m|{'{:.2f}'.format(qe_bests_geo ** (1 / total))}\n"
- self.report += f"|Geomeric mean QO default vs best|{'{:.2f}'.format(qo_yb_bests_geo ** (1 / total))}|{'{:.2f}'.format(qo_pg_bests_geo ** (1 / total))}\n"
- self._end_table()
-
- self.report += "\n[#top]\n== QE score\n"
-
- num_columns = 7
- for tag, queries in self.queries.items():
- self._start_table("1,1,1,1,1,1,4")
- self.report += "|YB|YB Best|PG|PG Best|Ratio YB vs PG|Ratio Best YB vs PG|Query\n"
- self.report += f"{num_columns}+m|{tag}.sql\n"
- for query in queries:
- yb_query = query[0]
- pg_query = query[1]
-
- yb_best = yb_query.get_best_optimization(self.config)
- pg_best = pg_query.get_best_optimization(self.config)
-
- pg_success = pg_query.execution_time_ms != 0
-
- default_yb_equality = "[green]" if yb_query.compare_plans(
- yb_best.execution_plan) else "[red]"
- default_pg_equality = "[green]" if pg_success and pg_query.compare_plans(
- pg_best.execution_plan) else "[red]"
-
- best_yb_pg_equality = "(eq) " if yb_best.compare_plans(
- pg_best.execution_plan) else ""
-
- ratio_x3 = yb_query.execution_time_ms / (
- 3 * pg_query.execution_time_ms) if pg_query.execution_time_ms != 0 else 99999999
- ratio_x3_str = "{:.2f}".format(
- yb_query.execution_time_ms / pg_query.execution_time_ms if pg_query.execution_time_ms != 0 else 99999999)
- ratio_color = "[green]" if ratio_x3 <= 1.0 else "[red]"
-
- ratio_best = yb_best.execution_time_ms / (
- 3 * pg_best.execution_time_ms) \
- if yb_best.execution_time_ms != 0 and pg_success else 99999999
- ratio_best_x3_str = "{:.2f}".format(
- yb_best.execution_time_ms / pg_best.execution_time_ms
- if yb_best.execution_time_ms != 0 and pg_success else 99999999)
- ratio_best_color = "[green]" if ratio_best <= 1.0 else "[red]"
-
- bitmap_flag = "[blue]" if pg_success and "bitmap" in pg_query.execution_plan.full_str.lower() else "[black]"
-
- self.report += f"a|[black]#*{'{:.2f}'.format(yb_query.execution_time_ms)}*#\n" \
- f"a|{default_yb_equality}#*{'{:.2f}'.format(yb_best.execution_time_ms)}*#\n" \
- f"a|{bitmap_flag}#*{'{:.2f}'.format(pg_query.execution_time_ms)}*#\n" \
- f"a|{default_pg_equality}#*{'{:.2f}'.format(pg_best.execution_time_ms)}*#\n" \
- f"a|{ratio_color}#*{ratio_x3_str}*#\n" \
- f"a|{ratio_best_color}#*{best_yb_pg_equality}{ratio_best_x3_str}*#\n"
- self.report += f"a|[#{yb_query.query_hash}_top]\n<<{yb_query.query_hash}>>\n"
- self._start_source(["sql"])
- self.report += format_sql(pg_query.query.replace("|", "\|"))
- self._end_source()
- self.report += "\n"
- self._end_table_row()
-
- self._end_table()
-
- # different results links
- for tag in self.queries.keys():
- self.report += f"\n<<{tag}>>\n"
-
- for tag, queries in self.queries.items():
- self.report += f"\n[#{tag}]\n== {tag} queries file\n\n"
- for query in queries:
- self.__report_query(query[0], query[1], True)
-
- def __report_near_queries(self, query: Type[Query]):
- if query.optimizations:
- best_optimization = query.get_best_optimization(self.config)
- if add_to_report := "".join(
- f"`{optimization.explain_hints}`\n\n"
- for optimization in query.optimizations
- if allowed_diff(self.config, best_optimization.execution_time_ms,
- optimization.execution_time_ms)):
- self._start_collapsible("Near best optimization hints")
- self.report += add_to_report
- self._end_collapsible()
-
- def __report_heatmap(self, query: Type[Query]):
- """
- Here is the deal. In PG plans we can separate each plan tree node by splitting by `->`
- When constructing heatmap need to add + or - to the beginning of string `\n`.
- So there is 2 splitters - \n and -> and need to construct correct result.
-
- :param query:
- :return:
- """
- # TODO FIX THIS!!!!!
- if not (execution_plan_heatmap := query.heatmap()):
- return
-
- best_decision = max(row['weight'] for row in execution_plan_heatmap.values())
- last_rowid = max(execution_plan_heatmap.keys())
- result = ""
- for row_id, row in execution_plan_heatmap.items():
- rows = row['str'].split("\n")
-
- if row['weight'] == best_decision:
- result = self.fix_last_newline_in_result(result, rows)
- result += "\n".join([f"+{line}" for line_id, line in enumerate(rows) if
- line_id != (len(rows) - 1)]) + f"\n{rows[-1]}"
- elif row['weight'] == 0:
- result = self.fix_last_newline_in_result(result, rows)
- result += "\n".join([f"-{line}" for line_id, line in enumerate(rows) if
- line_id != (len(rows) - 1)]) + f"\n{rows[-1]}"
- else:
- result += f"{row['str']}"
-
- # skip adding extra -> to the end of list
- if row_id != last_rowid:
- result += "->"
-
- self._start_collapsible("Plan heatmap")
- self._start_source(["diff"])
- self.report += result
- self._end_source()
- self._end_collapsible()
-
- @staticmethod
- def fix_last_newline_in_result(result, rows):
- if result:
- splitted_result = result.split("\n")
- result = "\n".join(splitted_result[:-1])
- last_newline = splitted_result[-1]
- rows[0] = f"{last_newline}{rows[0]}"
- result += "\n"
- return result
-
- # noinspection InsecureHash
- def __report_query(self, yb_query: Type[Query], pg_query: Type[Query], show_best: bool):
- yb_best = yb_query.get_best_optimization(self.config)
-
- self.reported_queries_counter += 1
-
- self.report += f"\n[#{yb_query.query_hash}]\n"
- self.report += f"=== Query {yb_query.query_hash}"
- self.report += f"\n{yb_query.tag}\n"
- self.report += "\n<>\n"
- self.report += f"\n<<{yb_query.query_hash}_top,Show in summary>>\n"
- self._add_double_newline()
-
- self._start_source(["sql"])
- self.report += format_sql(yb_query.query.replace("|", "\|"))
- self._end_source()
-
- self._add_double_newline()
- self.report += f"YB Default explain hints - `{yb_query.explain_hints}`"
- self._add_double_newline()
-
- if show_best:
- self._add_double_newline()
- self.report += f"YB Best explain hints - `{yb_best.explain_hints}`"
- self._add_double_newline()
-
- self.__report_near_queries(yb_query)
-
- filename = self.create_query_plot(yb_best, yb_query.optimizations, yb_query)
- self.report += f"image::{filename}[\"Query {self.reported_queries_counter}\",align=\"center\"]"
-
- self._add_double_newline()
-
- self._add_double_newline()
- default_yb_equality = "(eq) " if yb_query.compare_plans(
- yb_best.execution_plan) else ""
- default_pg_equality = ""
- default_yb_pg_equality = ""
-
- best_yb_pg_equality = ""
- if pg_query and pg_query.execution_time_ms != 0:
- self._start_table("5")
- self.report += "|Metric|YB|YB Best|PG|PG Best\n"
-
- pg_best = pg_query.get_best_optimization(self.config)
- default_pg_equality = "(eq) " if pg_query.compare_plans(
- pg_best.execution_plan) else ""
- best_yb_pg_equality = "(eq) " if yb_best.compare_plans(
- pg_best.execution_plan) else ""
- default_yb_pg_equality = "(eq) " if yb_query.compare_plans(
- pg_query.execution_plan) else ""
-
- if 'order by' in yb_query.query:
- self._start_table_row()
- self.report += \
- f"!! Result hash|{yb_query.result_hash}|{yb_best.result_hash}|{pg_query.result_hash}|{pg_best.result_hash}" \
- if pg_query.result_hash != yb_query.result_hash else \
- f"Result hash|`{yb_query.result_hash}|{yb_best.result_hash}|{pg_query.result_hash}|{pg_best.result_hash}"
- self._end_table_row()
-
- self._start_table_row()
- self.report += f"Cardinality|{yb_query.result_cardinality}|{yb_best.result_cardinality}|{pg_query.result_cardinality}|{pg_best.result_cardinality}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Estimated cost|{yb_query.execution_plan.get_estimated_cost()}|{default_yb_equality}{yb_best.execution_plan.get_estimated_cost()}|{pg_query.execution_plan.get_estimated_cost()}|{default_pg_equality}{pg_best.execution_plan.get_estimated_cost()}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Execution time|{'{:.2f}'.format(yb_query.execution_time_ms)}|{default_yb_equality}{'{:.2f}'.format(yb_best.execution_time_ms)}|{'{:.2f}'.format(pg_query.execution_time_ms)}|{default_pg_equality}{'{:.2f}'.format(pg_best.execution_time_ms)}"
- self._end_table_row()
- else:
- self._start_table("3")
- self.report += "|Metric|YB|YB Best\n"
-
- if yb_best.result_hash != yb_query.result_hash:
- self.report += f"!! Result hash|{yb_query.result_hash}|{yb_best.result_hash}"
- else:
- self.report += f"Result hash|{yb_query.result_hash}|{yb_best.result_hash}"
- self._end_table_row()
-
- self._start_table_row()
- self.report += f"Cardinality|{yb_query.result_cardinality}|{yb_best.result_cardinality}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Optimizer cost|{yb_query.execution_plan.get_estimated_cost()}|{default_yb_equality}{yb_best.execution_plan.get_estimated_cost()}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Execution time|{yb_query.execution_time_ms}|{default_yb_equality}{yb_best.execution_time_ms}"
- self._end_table_row()
- self._end_table()
-
- self._start_table()
- self._start_table_row()
-
- if pg_query and pg_query.execution_time_ms != 0:
- bitmap_used = "(bm) " if "bitmap" in pg_query.execution_plan.full_str.lower() else ""
- self._start_collapsible(f"{bitmap_used}PG plan")
- self._start_source(["diff"])
- self.report += pg_query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- pg_best = pg_query.get_best_optimization(self.config)
- bitmap_used = "(bm) " if "bitmap" in pg_best.execution_plan.full_str.lower() else ""
- self._start_collapsible(f"{default_pg_equality}{bitmap_used}PG best")
- self._start_source(["diff"])
- self.report += pg_best.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible(f"{default_yb_pg_equality}PG default vs YB default")
- self._start_source(["diff"])
- # postgres plan should be red
- self.report += self._get_plan_diff(
- yb_query.execution_plan.full_str,
- pg_query.execution_plan.full_str,
- )
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible(f"{best_yb_pg_equality}PG best vs YB best")
- self._start_source(["diff"])
- self.report += self._get_plan_diff(
- yb_best.execution_plan.full_str,
- pg_best.execution_plan.full_str,
- )
- self._end_source()
- self._end_collapsible()
-
- if show_best:
- self.__report_heatmap(yb_query)
-
- self._start_collapsible("YB default plan")
- self._start_source(["diff"])
- self.report += yb_query.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible(f"YB best plan")
- self._start_source(["diff"])
- self.report += yb_best.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self.report += f"{default_yb_equality}YB default vs YB best\n"
- self._start_source(["diff"])
- diff = self._get_plan_diff(yb_query.execution_plan.full_str,
- yb_best.execution_plan.full_str)
- if not diff:
- diff = yb_query.execution_plan.full_str
-
- self.report += diff
- self._end_source()
- self._end_table_row()
-
- self.report += "\n"
-
- self._end_table()
-
- self._add_double_newline()
diff --git a/src/reports/adoc/selectivity.py b/src/reports/adoc/selectivity.py
deleted file mode 100644
index fdc32a3e..00000000
--- a/src/reports/adoc/selectivity.py
+++ /dev/null
@@ -1,193 +0,0 @@
-from sql_formatter.core import format_sql
-
-from objects import ListOfQueries, Query
-from reports.abstract import Report
-from utils import allowed_diff
-
-
-class SelectivityReport(Report):
- def __init__(self):
- super().__init__()
-
- self.different_explain_plans = []
- self.same_execution_plan = []
- self.almost_same_execution_time = []
- self.improved_execution_time = []
- self.worse_execution_time = []
-
- def get_report_name(self):
- return "Default/Analyze/Analyze+Statistics"
-
- @classmethod
- def generate_report(cls,
- loq_default: ListOfQueries,
- loq_default_analyze: ListOfQueries,
- loq_ta: ListOfQueries,
- loq_ta_analyze: ListOfQueries,
- loq_stats: ListOfQueries,
- loq_stats_analyze: ListOfQueries):
- report = SelectivityReport()
-
- report.report_model(loq_default.model_queries)
-
- for query in zip(loq_default.queries,
- loq_default_analyze.queries,
- loq_ta.queries,
- loq_ta_analyze.queries,
- loq_stats.queries,
- loq_stats_analyze.queries):
- report.add_query(*query)
-
- report.build_report()
- report.publish_report("sltvty")
-
- def add_query(self,
- default: Query,
- default_analyze: Query,
- ta: Query,
- ta_analyze: Query,
- stats: Query,
- stats_analyze: Query
- ):
- queries_tuple = [default, default_analyze, ta, ta_analyze, stats, stats_analyze]
- if not default.compare_plans(default_analyze.execution_plan) or \
- not ta.compare_plans(ta_analyze.execution_plan) or \
- not stats.compare_plans(stats_analyze.execution_plan):
- self.different_explain_plans.append(queries_tuple)
-
- if default.compare_plans(stats_analyze.execution_plan):
- self.same_execution_plan.append(queries_tuple)
- elif allowed_diff(self.config, default.execution_time_ms, stats_analyze.execution_time_ms):
- self.almost_same_execution_time.append(queries_tuple)
- elif default.execution_time_ms < stats_analyze.execution_time_ms:
- self.worse_execution_time.append(queries_tuple)
- else:
- self.improved_execution_time.append(queries_tuple)
-
- def build_report(self):
- # link to top
- self.report += "\n[#top]\n== All results by analysis type\n"
- # different results links
- self.report += "\n<>\n"
- self.report += "\n<>\n"
- self.report += "\n<>\n"
- self.report += "\n<>\n"
- self.report += "\n<>\n"
-
- self.report += f"\n[#error]\n== ERROR: Different EXPLAIN and EXPLAIN ANALYZE plans ({len(self.different_explain_plans)})\n\n"
- for query in self.different_explain_plans:
- self.__report_query(*query)
-
- self.report += f"\n[#worse]\n== Worse execution time queries ({len(self.worse_execution_time)})\n\n"
- for query in self.worse_execution_time:
- self.__report_query(*query)
-
- self.report += f"\n[#same_time]\n== Almost same execution time queries ({len(self.almost_same_execution_time)})\n\n"
- for query in self.almost_same_execution_time:
- self.__report_query(*query)
-
- self.report += f"\n[#improved]\n== Improved execution time ({len(self.improved_execution_time)})\n\n"
- for query in self.improved_execution_time:
- self.__report_query(*query)
-
- self.report += f"\n[#same_plan]\n\n== Same execution plan ({len(self.same_execution_plan)})\n\n"
- for query in self.same_execution_plan:
- self.__report_query(*query)
-
- # noinspection InsecureHash
- def __report_query(self,
- default: Query,
- default_analyze: Query,
- analyze: Query,
- analyze_analyze: Query,
- all: Query,
- all_analyze: Query):
- self.reported_queries_counter += 1
-
- self.report += f"=== Query {default.query_hash}"
- self.report += f"\n{default.tag}\n"
- self.report += "\n<>\n"
- self._add_double_newline()
-
- self._start_source(["sql"])
- self.report += format_sql(default.query.replace("|", "\|"))
- self._end_source()
-
- self._add_double_newline()
-
- self._start_table("7")
- self.report += "|Metric|Default|Default+QA|TA|TA + QA|S+TA|S+TA+QA\n"
- self._start_table_row()
- self.report += f"Cardinality|{default.result_cardinality}|{default_analyze.result_cardinality}|" \
- f"{analyze.result_cardinality}|{analyze_analyze.result_cardinality}|" \
- f"{all.result_cardinality}|{all_analyze.result_cardinality}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Optimizer cost|{default.execution_plan.get_estimated_cost()}|{default_analyze.execution_plan.get_estimated_cost()}|" \
- f"{analyze.execution_plan.get_estimated_cost()}|{analyze_analyze.execution_plan.get_estimated_cost()}|" \
- f"{all.execution_plan.get_estimated_cost()}|{all_analyze.execution_plan.get_estimated_cost()}"
- self._end_table_row()
- self._start_table_row()
- self.report += f"Execution time|{default.execution_time_ms}|{default_analyze.execution_time_ms}|" \
- f"{analyze.execution_time_ms}|{analyze_analyze.execution_time_ms}|" \
- f"{all.execution_time_ms}|{all_analyze.execution_time_ms}"
- self._end_table_row()
- self._end_table()
-
- self._start_table()
-
- self._start_table_row()
-
- self._start_collapsible("Default approach plan (w/o analyze)")
- self._start_source(["diff"])
- self.report += default.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Default approach plan with EXPLAIN ANALYZE (w/o analyze)")
- self._start_source(["diff"])
- self.report += default_analyze.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Plan with analyzed table (w/ analyze)")
- self._start_source(["diff"])
- self.report += analyze.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Plan with analyzed table with EXPLAIN ANALYZE (w/ analyze)")
- self._start_source(["diff"])
- self.report += analyze_analyze.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible("Stats + table analyze (w/ analyze and statistics)")
- self._start_source(["diff"])
- self.report += all.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_collapsible(
- "Stats + table analyze with EXPLAIN ANALYZE (w/ analyze and statistics)")
- self._start_source(["diff"])
- self.report += all_analyze.execution_plan.full_str
- self._end_source()
- self._end_collapsible()
-
- self._start_source(["diff"])
-
- diff = self._get_plan_diff(default.execution_plan.full_str,
- all_analyze.execution_plan.full_str)
- if not diff:
- diff = default.execution_plan.full_str
-
- self.report += diff
- self._end_source()
- self._end_table_row()
-
- self.report += "\n"
-
- self._end_table()
-
- self._add_double_newline()
diff --git a/src/reports/xls/regression.py b/src/reports/xls/regression.py
deleted file mode 100644
index 888ea20e..00000000
--- a/src/reports/xls/regression.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from typing import Type
-
-from sql_formatter.core import format_sql
-
-from objects import ListOfQueries, Query
-from reports.abstract import Report
-
-
-class RegressionXlsReport(Report):
- def __init__(self):
- super().__init__()
-
- self.logger.info(f"Created report folder for this run at 'report/{self.start_date}'")
-
- self.queries = {}
-
- @classmethod
- def generate_report(cls, first_loq: ListOfQueries, second_loq: ListOfQueries):
- report = RegressionXlsReport()
-
- for qid, query in enumerate(first_loq.queries):
- report.add_query(query, second_loq.queries[qid])
-
- report.build_report()
-
- def get_report_name(self):
- return "regression"
-
- def define_version(self, version):
- pass
-
- def add_query(self, query: Type[Query], pg: Type[Query] | None):
- if query.tag not in self.queries:
- self.queries[query.tag] = [[query, pg], ]
- else:
- self.queries[query.tag].append([query, pg])
-
- def build_report(self):
- import xlsxwriter
-
- workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_regression.xls')
- worksheet = workbook.add_worksheet()
-
- head_format = workbook.add_format()
- head_format.set_bold()
- head_format.set_bg_color('#999999')
-
- eq_format = workbook.add_format()
- eq_format.set_bold()
- eq_format.set_bg_color('#d9ead3')
-
- eq_bad_format = workbook.add_format()
- eq_bad_format.set_bold()
- eq_bad_format.set_bg_color('#fff2cc')
-
- worksheet.write(0, 0, "First", head_format)
- worksheet.write(0, 1, "Second", head_format)
- worksheet.write(0, 2, "Ratio", head_format)
- worksheet.write(0, 3, "Query", head_format)
- worksheet.write(0, 4, "Query Hash", head_format)
-
- row = 1
- # Iterate over the data and write it out row by row.
- for tag, queries in self.queries.items():
- for query in queries:
- first_query: Query = query[0]
- second_query: Query = query[1]
-
- ratio = second_query.execution_time_ms / (
- first_query.execution_time_ms) if first_query.execution_time_ms != 0 else 99999999
- ratio_color = eq_bad_format if ratio > 1.0 else eq_format
-
- worksheet.write(row, 0, '{:.2f}'.format(first_query.execution_time_ms))
- worksheet.write(row, 1,
- f"{'{:.2f}'.format(second_query.execution_time_ms)}")
- worksheet.write(row, 2, f'{ratio}', ratio_color)
- worksheet.write(row, 3, f'{format_sql(first_query.query)}')
- worksheet.write(row, 4, f'{first_query.query_hash}')
- row += 1
-
- workbook.close()
diff --git a/src/reports/xls/score.py b/src/reports/xls/score.py
deleted file mode 100644
index 48b51c2b..00000000
--- a/src/reports/xls/score.py
+++ /dev/null
@@ -1,183 +0,0 @@
-from typing import Type
-
-from matplotlib import pyplot as plt
-from sql_formatter.core import format_sql
-
-from objects import ListOfQueries, Query
-from db.postgres import PostgresQuery
-from reports.abstract import Report
-
-
-class ScoreXlsReport(Report):
- def __init__(self):
- super().__init__()
-
- self.logger.info(f"Created report folder for this run at 'report/{self.start_date}'")
-
- self.queries = {}
-
- @classmethod
- def generate_report(cls, loq: ListOfQueries, pg_loq: ListOfQueries = None):
- report = ScoreXlsReport()
-
- for qid, query in enumerate(loq.queries):
- report.add_query(query, pg_loq.queries[qid] if pg_loq else None)
-
- report.build_report()
-
- def get_report_name(self):
- return "score"
-
- def define_version(self, version):
- self.report += f"[VERSION]\n====\n{version}\n====\n\n"
-
- def calculate_score(self, query):
- if query.execution_time_ms == 0:
- return -1
- else:
- return "{:.2f}".format(
- query.get_best_optimization(
- self.config).execution_time_ms / query.execution_time_ms)
-
- def create_plot(self, best_optimization, optimizations, query):
- plt.xlabel('Execution time')
- plt.ylabel('Optimizer cost')
-
- plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms != 0],
- [q.execution_plan.get_estimated_cost() for q in optimizations if q.execution_time_ms != 0], 'k.',
- [query.execution_time_ms],
- [query.execution_plan.get_estimated_cost()], 'r^',
- [best_optimization.execution_time_ms],
- [best_optimization.execution_plan.get_estimated_cost()], 'go')
-
- file_name = f'imgs/query_{self.reported_queries_counter}.png'
- plt.savefig(f"report/{self.start_date}/{file_name}")
- plt.close()
-
- return file_name
-
- def add_query(self, query: Type[Query], pg: Query | None):
- if query.tag not in self.queries:
- self.queries[query.tag] = [[query, pg], ]
- else:
- self.queries[query.tag].append([query, pg])
-
- def build_report(self):
- import xlsxwriter
-
- workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_score.xls')
- worksheet = workbook.add_worksheet()
-
- head_format = workbook.add_format()
- head_format.set_bold()
- head_format.set_bg_color('#999999')
-
- eq_format = workbook.add_format()
- eq_format.set_bold()
- eq_format.set_bg_color('#d9ead3')
-
- eq_bad_format = workbook.add_format()
- eq_bad_format.set_bold()
- eq_bad_format.set_bg_color('#fff2cc')
-
- eq_good_format = workbook.add_format()
- eq_good_format.set_bold()
- eq_good_format.set_bg_color('#d9ead3')
-
- bm_format = workbook.add_format()
- bm_format.set_bold()
- bm_format.set_bg_color('#cfe2f3')
-
- pg_comparison_format = workbook.add_format()
- pg_comparison_format.set_bold()
- pg_comparison_format.set_bg_color('#fce5cd')
-
- # Start from the first cell. Rows and columns are zero indexed.
- yb_bests = 0
- pg_bests = 0
- total = 0
- for queries in self.queries.values():
- for query in queries:
- yb_query = query[0]
- pg_query = query[1]
-
- yb_best = yb_query.get_best_optimization(self.config, )
- pg_best = pg_query.get_best_optimization(self.config, )
-
- yb_bests += 1 if yb_query.compare_plans(yb_best.execution_plan) else 0
- pg_bests += 1 if pg_query.compare_plans(pg_best.execution_plan) else 0
-
- total += 1
-
- worksheet.write(0, 0, "YB", head_format)
- worksheet.write(0, 1, "YB Best", head_format)
- worksheet.write(0, 2, "PG", head_format)
- worksheet.write(0, 3, "PG Best", head_format)
- worksheet.write(0, 4, "Ratio YB vs PG", head_format)
- worksheet.write(0, 5, "Best YB vs PG", head_format)
- worksheet.write(0, 6, "Query", head_format)
- worksheet.write(0, 7, "Query Hash", head_format)
-
- row = 1
- # Iterate over the data and write it out row by row.
- for tag, queries in self.queries.items():
- for query in queries:
- yb_query: PostgresQuery = query[0]
- pg_query: PostgresQuery = query[1]
-
- yb_best = yb_query.get_best_optimization(self.config, )
- pg_best = pg_query.get_best_optimization(self.config, )
-
- default_yb_equality = yb_query.compare_plans(yb_best.execution_plan)
- default_pg_equality = pg_query.compare_plans(pg_best.execution_plan)
-
- default_yb_pg_equality = yb_query.compare_plans(pg_query.execution_plan)
- best_yb_pg_equality = yb_best.compare_plans(pg_best.execution_plan)
-
- ratio_x3 = yb_query.execution_time_ms / (
- 3 * pg_query.execution_time_ms) if pg_query.execution_time_ms != 0 else 99999999
- ratio_x3_str = "{:.2f}".format(
- yb_query.execution_time_ms / pg_query.execution_time_ms if pg_query.execution_time_ms != 0 else 99999999)
- ratio_color = ratio_x3 > 1.0
-
- ratio_best = yb_best.execution_time_ms / (
- 3 * pg_best.execution_time_ms) if yb_best.execution_time_ms != 0 else 99999999
- ratio_best_x3_str = "{:.2f}".format(
- yb_best.execution_time_ms / pg_best.execution_time_ms if yb_best.execution_time_ms != 0 else 99999999)
- ratio_best_color = ratio_best > 1.0
-
- bitmap_flag = "bitmap" in pg_query.execution_plan.full_str.lower()
-
- best_pg_format = None
- if ratio_best_color and best_yb_pg_equality:
- best_pg_format = eq_bad_format
- elif best_yb_pg_equality:
- best_pg_format = eq_good_format
- elif ratio_best_color:
- best_pg_format = pg_comparison_format
-
- df_pf_format = None
- if ratio_color and default_yb_pg_equality:
- df_pf_format = eq_bad_format
- elif default_yb_pg_equality:
- df_pf_format = eq_good_format
- elif ratio_color:
- df_pf_format = pg_comparison_format
-
- worksheet.write(row, 0, '{:.2f}'.format(yb_query.execution_time_ms))
- worksheet.write(row, 1,
- f"{'{:.2f}'.format(yb_best.execution_time_ms)}",
- eq_format if default_yb_equality else None)
- worksheet.write(row, 2,
- f"{'{:.2f}'.format(pg_query.execution_time_ms)}",
- bm_format if bitmap_flag else None)
- worksheet.write(row, 3,
- f"{'{:.2f}'.format(pg_best.execution_time_ms)}",
- eq_format if default_pg_equality else None)
- worksheet.write(row, 4, f"{ratio_x3_str}", df_pf_format)
- worksheet.write(row, 5, f"{ratio_best_x3_str}", best_pg_format)
- worksheet.write(row, 6, f'{format_sql(pg_query.query)}')
- worksheet.write(row, 7, f'{pg_query.query_hash}')
- row += 1
-
- workbook.close()
diff --git a/src/runner.py b/src/runner.py
old mode 100644
new mode 100755
index a88439e6..b0a261b6
--- a/src/runner.py
+++ b/src/runner.py
@@ -1,61 +1,98 @@
import argparse
+from os.path import exists
from pyhocon import ConfigFactory
+from actions.reports.score_stats import ScoreStatsReport
from config import Config, init_logger, ConnectionConfig, DDLStep
from db.factory import create_database
from db.postgres import DEFAULT_USERNAME, DEFAULT_PASSWORD, PostgresResultsLoader
-from reports.adoc.comparison import ComparisonReport
-from reports.adoc.regression import RegressionReport
-from reports.adoc.score import ScoreReport
-from reports.xls.score import ScoreXlsReport
-from reports.xls.regression import RegressionXlsReport
-from reports.adoc.selectivity import SelectivityReport
-from reports.adoc.taqo import TaqoReport
+from actions.reports.cost import CostReport
+from actions.reports.regression import RegressionReport
+from actions.reports.score import ScoreReport
+from actions.reports.selectivity import SelectivityReport
+from actions.reports.taqo import TaqoReport
+from actions.collect import CollectAction
-from scenario import Scenario
-from utils import get_bool_from_str
+from utils import get_bool_from_object, get_model_path
-def parse_ddls(ddl_ops):
+def parse_ddls(ddl_ops: str):
result = set()
if ddl_ops == "none":
return result
- if "database" in ddl_ops:
- result.add(DDLStep.DATABASE)
- if "create" in ddl_ops:
- result.add(DDLStep.CREATE)
- if "import" in ddl_ops:
- result.add(DDLStep.IMPORT)
- if "drop" in ddl_ops:
- result.add(DDLStep.DROP)
- if "analyze" in ddl_ops:
- result.add(DDLStep.ANALYZE)
+ ddl_ops = ddl_ops.lower()
+ for e in DDLStep:
+ if str(e.name).lower() in ddl_ops:
+ result.add(e)
return result
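The rewritten `parse_ddls` now checks every `DDLStep` member name against the lowercased `--ddls` string instead of a chain of `if` statements. A standalone sketch with a stub enum mirroring the steps listed in the `--ddls` help text (`database,create,analyze,import,compact,drop`):

```python
from enum import Enum, auto

class DDLStep(Enum):  # stub mirroring the steps from the --ddls help text
    DATABASE = auto()
    CREATE = auto()
    ANALYZE = auto()
    IMPORT = auto()
    COMPACT = auto()
    DROP = auto()

def parse_ddls(ddl_ops: str):
    if ddl_ops == "none":
        return set()
    ddl_ops = ddl_ops.lower()
    # substring check: any enum member whose name appears in the option string is enabled
    return {e for e in DDLStep if e.name.lower() in ddl_ops}

assert parse_ddls("database,create,import") == {DDLStep.DATABASE, DDLStep.CREATE, DDLStep.IMPORT}
```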
+def parse_model_config(model):
+ path_to_file = f"{get_model_path(model)}/model.conf"
+
+ if exists(path_to_file):
+ parsed_model_config = ConfigFactory.parse_file(path_to_file)
+ global_option_index = get_bool_from_object(configuration.get("all-index-check", True))
+ global_option_timeout = configuration.get("test-query-timeout", 1200)
+ global_compaction_timeout = configuration.get("compaction-timeout", 120)
+
+ configuration['all-index-check'] = global_option_index and parsed_model_config.get("all-index-check", True)
+ configuration['load-catalog-tables'] = parsed_model_config.get("load-catalog-tables", False)
+ configuration['test-query-timeout'] = parsed_model_config.get("test-query-timeout", global_option_timeout)
+ configuration['compaction-timeout'] = parsed_model_config.get("compaction-timeout", global_compaction_timeout)
+
+
+def define_database_name(args):
+ if args.database:
+ return args.database
+ else:
+ if args.colocated:
+ return f"taqo_{model.replace('-', '_')}"
+ else:
+ return f"taqo_{model.replace('-', '_')}_non_colocated"
+
+
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Query Optimizer Testing framework for PostgreSQL compatible DBs')
parser.add_argument('action',
help='Action to perform - collect or report')
+ parser.add_argument('--options',
+ help='Options to overwrite configuration file properties',
+ nargs='+', type=str)
parser.add_argument('--db',
default="yugabyte",
help='Database to run against')
+ parser.add_argument('--baseline',
+ default="",
+ help='Link to baseline run results (JSON)')
parser.add_argument('--config',
default="config/default.conf",
help='Configuration file path')
+ parser.add_argument('-p', '--yugabyte-bin-path',
+ default="",
+ help='Path to Yugabyte distribution binary files')
+ parser.add_argument('-m', '--yugabyte-master-addresses',
+ default="",
+ help='List of Yugabyte master nodes to use yb-admin command')
+ parser.add_argument('--yugabyte-stats',
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help='Do collect stats from pg_stat_statements_reset')
parser.add_argument('--type',
- help='Report type - taqo, regression, comparison or selectivity')
+ help='Report type - taqo, score, regression, comparison, selectivity or cost')
+
+ # report mode flags
- # TAQO or Comparison
+ # TAQO, Score, Comparison or Cost (--pg-results optional for TAQO, N/A for Cost)
parser.add_argument('--results',
default=None,
help='TAQO/Comparison: Path to results with optimizations for YB')
@@ -96,6 +133,14 @@ def parse_ddls(ddl_ops):
default=None,
help='Results with table analyze and enabled statistics and EXPLAIN ANALYZE')
+ # Cost
+ parser.add_argument('--interactive',
+ action=argparse.BooleanOptionalAction,
+ default=False,
+ help='Pop up an interactive chart, then quit (no boxplot chart support)')
+
+ # collect mode flags
+
parser.add_argument('--ddl-prefix',
default="",
help='DDL file prefix (default empty, might be postgres)')
@@ -107,6 +152,10 @@ def parse_ddls(ddl_ops):
action=argparse.BooleanOptionalAction,
default=False,
help='Collect only execution plans, execution time will be equal to cost')
+ parser.add_argument('--bitmap-enabled',
+ action=argparse.BooleanOptionalAction,
+ default=False,
+ help='Enable bitmap scan for PG and YB databases')
parser.add_argument('--optimizations',
action=argparse.BooleanOptionalAction,
default=False,
@@ -123,13 +172,17 @@ def parse_ddls(ddl_ops):
parser.add_argument('--revision',
help='Git revision or path to release build')
parser.add_argument('--ddls',
- default="database,create,analyze,import,drop",
- help='Model creation queries, comma separated: database,create,analyze,import,drop')
+ default="database,create,analyze,import,compact,drop",
+ help='Model creation queries, comma separated: database,create,analyze,import,compact,drop')
parser.add_argument('--clean-db',
action=argparse.BooleanOptionalAction,
- default=True,
+ default=False,
help='Keep database after test')
+ parser.add_argument('--colocated',
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help='Create database with colocation enabled (Yugabyte only, default true)')
parser.add_argument('--allow-destroy-db',
action=argparse.BooleanOptionalAction,
default=True,
@@ -163,16 +216,15 @@ def parse_ddls(ddl_ops):
default=DEFAULT_PASSWORD,
help='Password for user for connection')
parser.add_argument('--database',
- default="taqo",
help='Target database in postgres compatible database')
- parser.add_argument('--enable-statistics',
- action=argparse.BooleanOptionalAction,
- default=False,
- help='Evaluate yb_enable_optimizer_statistics before running queries')
parser.add_argument('--explain-clause',
default=None,
help='Explain clause that will be placed before query. Default "EXPLAIN"')
+ parser.add_argument('--server-side-execution',
+ action=argparse.BooleanOptionalAction,
+ default=False,
+ help='Evaluate queries on server side, for PG using "EXPLAIN ANALYZE"')
parser.add_argument('--session-props',
default="",
help='Additional session properties queries')
@@ -187,10 +239,16 @@ def parse_ddls(ddl_ops):
parser.add_argument('--output',
help='Output JSON file name in report folder, [.json] will be added')
+ # collect/report mode common flags
+
parser.add_argument('--clear',
action=argparse.BooleanOptionalAction,
default=False,
help='Clear logs directory')
+ parser.add_argument('--exit-on-fail',
+ action=argparse.BooleanOptionalAction,
+ default=False,
+ help='Exit on query failures (DDL failure is not configurable)')
parser.add_argument('--yes',
action=argparse.BooleanOptionalAction,
@@ -205,9 +263,30 @@ def parse_ddls(ddl_ops):
configuration = ConfigFactory.parse_file(args.config)
ddls = parse_ddls(args.ddls)
+ model = args.model
+
+ parse_model_config(model)
+
+ options_config = {}
+ if args.options:
+ for option in args.options:
+ key, value = option.split('=', 1)
+ if value.startswith('int:'):
+ value = int(value.replace('int:', ''))
+ elif value.startswith('bool:'):
+ """
+ --to key1=bool:True
+ """
+ value = value.replace('bool:', '').lower() == 'true'
+
+ options_config[key] = value
+
+ configuration = configuration | options_config
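`--options` values can carry `int:` or `bool:` prefixes that are coerced before being merged over the parsed config file. A standalone sketch of that coercion (the option keys below are only examples borrowed from existing config names):

```python
def coerce_option(value: str):
    # "int:3" -> 3, "bool:true" -> True, anything else stays a string
    if value.startswith('int:'):
        return int(value[len('int:'):])
    if value.startswith('bool:'):
        return value[len('bool:'):].lower() == 'true'
    return value

raw_options = ['num-retries=int:3', 'all-index-check=bool:false']
overrides = {key: coerce_option(value)
             for key, value in (option.split('=', 1) for option in raw_options)}
assert overrides == {'num-retries': 3, 'all-index-check': False}
# the result is then merged over the parsed HOCON config: configuration | overrides
```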
+ loader = PostgresResultsLoader()
config = Config(
logger=init_logger("DEBUG" if args.verbose else "INFO"),
+ exit_on_fail=args.exit_on_fail,
source_path=args.source_path or configuration.get("source-path", None),
num_nodes=int(args.num_nodes) or configuration.get("num-nodes", 3),
@@ -218,42 +297,50 @@ def parse_ddls(ddl_ops):
clean_db=args.clean_db,
allow_destroy_db=args.allow_destroy_db,
clean_build=args.clean_build,
+ colocated_database=args.colocated,
+ bitmap_enabled=args.bitmap_enabled,
+ yugabyte_bin_path=args.yugabyte_bin_path or configuration.get("yugabyte-bin-path", None),
+ yugabyte_master_addresses=args.yugabyte_master_addresses if args.yugabyte_master_addresses else args.host,
+ yugabyte_collect_stats=args.yugabyte_stats,
connection=ConnectionConfig(host=args.host,
port=args.port,
username=args.username,
password=args.password,
- database=args.database),
+ database=define_database_name(args)),
- model=args.model,
+ model=model,
+ all_index_check=configuration.get("all-index-check", True),
+ load_catalog_tables=configuration.get("load-catalog-tables", False),
+ baseline_path=args.baseline,
+ baseline_results=loader.get_queries_from_previous_result(args.baseline) if args.baseline else None,
output=args.output,
ddls=ddls,
remote_data_path=args.remote_data_path,
- ddl_prefix=args.ddl_prefix if args.ddl_prefix else (
- args.db if args.db != "yugabyte" else ""
- ),
+ ddl_prefix=args.ddl_prefix or (args.db if args.db != "yugabyte" else ""),
with_optimizations=args.optimizations,
plans_only=args.plans_only,
+ server_side_execution=get_bool_from_object(args.server_side_execution),
- enable_statistics=args.enable_statistics or get_bool_from_str(
- configuration.get("enable-statistics", False)),
explain_clause=args.explain_clause or configuration.get("explain-clause", "EXPLAIN"),
- session_props=configuration.get("session-props", [qr for qr in args.session_props.split(",") if qr]),
+ session_props=configuration.get("session-props", []) +
+ (args.session_props.split(",") if args.session_props else []),
basic_multiplier=int(args.basic_multiplier),
- skip_percentage_delta=configuration.get("skip-percentage-delta", 0.05),
- skip_timeout_delta=configuration.get("skip-timeout-delta", 1),
- ddl_query_timeout=configuration.get("ddl-query-timeout", 3600),
- test_query_timeout=configuration.get("test-query-timeout", 1200),
- look_near_best_plan=configuration.get("look-near-best-plan", True),
- all_pairs_threshold=configuration.get("all-pairs-threshold", 3),
+ skip_percentage_delta=float(configuration.get("skip-percentage-delta", 0.05)),
+ skip_timeout_delta=int(configuration.get("skip-timeout-delta", 1)),
+ ddl_query_timeout=int(configuration.get("ddl-query-timeout", 3600)),
+ test_query_timeout=int(configuration.get("test-query-timeout", 1200)),
+ compaction_timeout=int(configuration.get("compaction-timeout", 120)),
+ look_near_best_plan=get_bool_from_object(configuration.get("look-near-best-plan", True)),
+ all_pairs_threshold=int(configuration.get("all-pairs-threshold", 3)),
num_queries=int(args.num_queries)
if int(args.num_queries) > 0 else configuration.get("num-queries", -1),
- num_retries=configuration.get("num-retries", 5),
- num_warmup=configuration.get("num-warmup", 2),
+ num_retries=int(configuration.get("num-retries", 5)),
+ num_warmup=int(configuration.get("num-warmup", 1)),
- parametrized=args.parametrized,
+ parametrized=get_bool_from_object(args.parametrized),
asciidoctor_path=configuration.get("asciidoctor-path", "asciidoc"),
@@ -264,14 +351,11 @@ def parse_ddls(ddl_ops):
config.logger.info("------------------------------------------------------------")
config.logger.info("Query Optimizer Testing Framework for Postgres compatible DBs")
- loader = PostgresResultsLoader()
-
if args.action == "collect":
config.logger.info("")
config.logger.info(f"Collecting results for model: {config.model}")
config.logger.info("Configuration:")
- for line in str(config).split("\n"):
- config.logger.info(line)
+ config.logger.info(str(config))
config.logger.info("------------------------------------------------------------")
if args.output is None:
@@ -282,8 +366,7 @@ def parse_ddls(ddl_ops):
input("Validate configuration carefully and press Enter...")
config.logger.info("Evaluating scenario")
- sc = Scenario(config)
- sc.evaluate()
+ CollectAction().evaluate()
elif args.action == "report":
config.logger.info("")
config.logger.info(f"Generation {args.type} report")
@@ -303,37 +386,18 @@ def parse_ddls(ddl_ops):
args.pg_results) if args.pg_results else None
ScoreReport.generate_report(yb_queries, pg_queries)
- elif args.type == "score_xls":
+ elif args.type == "score_stats":
yb_queries = loader.get_queries_from_previous_result(args.results)
pg_queries = loader.get_queries_from_previous_result(
args.pg_results) if args.pg_results else None
- ScoreXlsReport.generate_report(yb_queries, pg_queries)
+ ScoreStatsReport.generate_report(yb_queries, pg_queries)
elif args.type == "regression":
- report = RegressionReport()
-
v1_queries = loader.get_queries_from_previous_result(args.v1_results)
v2_queries = loader.get_queries_from_previous_result(args.v2_results)
- report.generate_report(args.v1_name, args.v2_name, v1_queries, v2_queries)
- elif args.type == "regression_xls":
- report = RegressionXlsReport()
-
- v1_queries = loader.get_queries_from_previous_result(args.v1_results)
- v2_queries = loader.get_queries_from_previous_result(args.v2_results)
-
- report.generate_report(v1_queries, v2_queries)
- elif args.type == "comparison":
- report = ComparisonReport()
-
- yb_queries = loader.get_queries_from_previous_result(args.results)
- pg_queries = loader.get_queries_from_previous_result(
- args.pg_results) if args.pg_results else None
-
- report.generate_report(yb_queries, pg_queries)
+ RegressionReport.generate_report(args.v1_name, args.v2_name, v1_queries, v2_queries)
elif args.type == "selectivity":
- report = SelectivityReport()
-
default_queries = loader.get_queries_from_previous_result(args.default_results)
default_analyze_queries = loader.get_queries_from_previous_result(
args.default_analyze_results)
@@ -343,7 +407,16 @@ def parse_ddls(ddl_ops):
stats_analyze_queries = loader.get_queries_from_previous_result(
args.stats_analyze_results)
- report.generate_report(default_queries, default_analyze_queries, ta_queries,
- ta_analyze_queries, stats_queries, stats_analyze_queries)
+ SelectivityReport.generate_report(default_queries, default_analyze_queries, ta_queries,
+ ta_analyze_queries, stats_queries, stats_analyze_queries)
+ elif args.type == "cost":
+ yb_queries = loader.get_queries_from_previous_result(args.results)
+ CostReport.generate_report(yb_queries, args.interactive)
else:
- raise AttributeError(f"Unknown test type defined {config.test}")
+ raise AttributeError(f"Unknown report type defined {args.type}")
+
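+ # exit code 1 signals hard failures, exit code 2 signals warnings only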
+ if config.has_failures:
+ config.logger.exception("Found issues during TAQO collect execution")
+ exit(1)
+ if config.has_warnings:
+ exit(2)
diff --git a/src/scenario.py b/src/scenario.py
deleted file mode 100644
index aac11bd7..00000000
--- a/src/scenario.py
+++ /dev/null
@@ -1,222 +0,0 @@
-import subprocess
-
-import psycopg2
-from tqdm import tqdm
-
-from models.factory import get_test_model
-from utils import evaluate_sql, calculate_avg_execution_time, get_md5
-
-
-class Scenario:
- def __init__(self, config):
- self.config = config
- self.logger = self.config.logger
- self.sut_database = self.config.database
-
- def start_db(self):
- self.logger.info(f"Initializing {self.sut_database.__class__.__name__} DB")
-
- commit_hash = self.config.revision
-
- self.sut_database.change_version_and_compile(commit_hash)
- self.sut_database.stop_database()
- self.sut_database.destroy()
- self.sut_database.start_database()
-
- return self.get_commit_message(commit_hash)
-
- def get_commit_message(self, commit_hash):
- if commit_hash:
- output = str(subprocess.check_output(
- f"echo `git log -n 1 --pretty=format:%s {commit_hash}`",
- cwd=self.config.source_path,
- shell=True)).rstrip('\n')
- return f"{output} ({commit_hash})"
- else:
- return ""
-
- def evaluate(self):
- loader = self.config.database.get_results_loader()
-
- commit_message = self.start_db()
- try:
- self.sut_database.create_test_database()
-
- self.sut_database.establish_connection(self.config.connection.database)
-
- loq = self.config.database.get_list_queries()
- loq.db_version = self.sut_database.connection.get_version()
- loq.model_queries, loq.queries = self.run_ddl_and_testing_queries(
- self.sut_database.connection.conn, self.config.with_optimizations)
- loq.git_message = commit_message
-
- self.logger.info(f"Storing results to report/{self.config.output}")
- loader.store_queries_to_file(loq, self.config.output)
- except Exception as e:
- self.logger.exception(e)
- raise e
- finally:
- if self.config.clean_db:
- self.sut_database.stop_database()
-
- def run_ddl_and_testing_queries(self,
- connection,
- evaluate_optimizations=False):
- queries = []
- model_queries = []
- try:
- model = get_test_model()
- created_tables, model_queries = model.create_tables(connection)
- queries = model.get_queries(created_tables)
- except Exception as e:
- self.logger.exception("Failed to evaluate DDL queries", e)
- exit(1)
-
- connection.autocommit = False
- self.evaluate_testing_queries(connection, queries, evaluate_optimizations)
-
- return model_queries, queries
-
- def evaluate_testing_queries(self, conn, queries, evaluate_optimizations):
- counter = 1
- for original_query in queries:
- with conn.cursor() as cur:
- self.sut_database.prepare_query_execution(cur)
-
- try:
- self.sut_database.set_query_timeout(cur, self.config.test_query_timeout)
-
- short_query = original_query.query.replace('\n', '')[:40]
- self.logger.info(
- f"Evaluating query {short_query}... [{counter}/{len(queries)}]")
-
- try:
- evaluate_sql(cur, original_query.get_explain())
- original_query.execution_plan = self.config.database.get_execution_plan(
- '\n'.join(
- str(item[0]) for item in cur.fetchall()))
-
- conn.rollback()
- self.sut_database.prepare_query_execution(cur)
- except psycopg2.errors.QueryCanceled:
- try:
- evaluate_sql(cur, original_query.get_heuristic_explain())
- original_query.execution_plan = self.config.database.get_execution_plan(
- '\n'.join(
- str(item[0]) for item in cur.fetchall()))
-
- conn.rollback()
- self.sut_database.prepare_query_execution(cur)
- except psycopg2.errors.QueryCanceled:
- self.logger.error("Unable to get execution plan even w/o analyze")
- original_query.execution_plan = self.config.database.get_execution_plan(
- '')
-
- if self.config.plans_only:
- original_query.execution_time_ms = \
- original_query.execution_plan.get_estimated_cost()
- else:
- calculate_avg_execution_time(cur, original_query, self.sut_database,
- num_retries=int(self.config.num_retries),
- connection=conn)
-
- if evaluate_optimizations and "dml" not in original_query.optimizer_tips.tags:
- self.logger.debug("Evaluating optimizations...")
- self.evaluate_optimizations(conn, cur, original_query)
-
- except psycopg2.Error as pe:
- # do not raise exception
- self.logger.exception(f"{original_query}\nFailed because of {pe}")
- except Exception as e:
- self.logger.info(original_query)
- raise e
- finally:
- counter += 1
-
- conn.rollback()
-
- def evaluate_optimizations(self, connection, cur, original_query):
- # build all possible optimizations
- database = self.config.database
- list_of_optimizations = database.get_list_optimizations(original_query)
-
- self.logger.debug(f"{len(list_of_optimizations)} optimizations generated")
- progress_bar = tqdm(list_of_optimizations)
- num_skipped = 0
- min_execution_time = original_query.execution_time_ms if original_query.execution_time_ms > 0 else (
- self.config.test_query_timeout * 1000)
- original_query.optimizations = []
- execution_plans_checked = set()
-
- for optimization in progress_bar:
- # in case of enable statistics enabled
- # we can get failure here and throw timeout
- original_query.optimizations.append(optimization)
-
- # set maximum execution time if this is first query,
- # or we are evaluating queries near best execution time
- if self.config.look_near_best_plan or len(original_query.optimizations) == 1:
- optimizer_query_timeout = \
- (original_query.optimizer_tips and original_query.optimizer_tips.max_timeout) or \
- f"{int(min_execution_time / 1000) + int(self.config.skip_timeout_delta)}"
-
- self.sut_database.set_query_timeout(cur, optimizer_query_timeout)
-
- self.try_to_get_default_explain_hints(cur, optimization, original_query)
-
- try:
- evaluate_sql(cur, optimization.get_explain())
- optimization.execution_plan = database.get_execution_plan(
- '\n'.join(
- str(item[0]) for item in cur.fetchall()))
-
- connection.rollback()
- self.sut_database.prepare_query_execution(cur)
- except psycopg2.errors.QueryCanceled as e:
- # failed by timeout - it's ok just skip optimization
- self.logger.debug(f"Getting execution plan failed with {e}")
-
- num_skipped += 1
- optimization.execution_time_ms = 0
- optimization.execution_plan = database.get_execution_plan("")
- continue
-
- exec_plan_md5 = get_md5(optimization.execution_plan.get_clean_plan())
- not_unique_plan = exec_plan_md5 in execution_plans_checked
- execution_plans_checked.add(exec_plan_md5)
-
- if self.config.plans_only:
- original_query.execution_time_ms = \
- original_query.execution_plan.get_estimated_cost()
- elif not_unique_plan or not calculate_avg_execution_time(
- cur,
- optimization,
- self.sut_database,
- num_retries=int(self.config.num_retries),
- connection=connection):
- num_skipped += 1
-
- # get new minimum execution time
- if optimization.execution_time_ms != 0 and \
- optimization.execution_time_ms < min_execution_time:
- min_execution_time = optimization.execution_time_ms
-
- progress_bar.set_postfix(
- {'skipped': num_skipped, 'min_execution_time_ms': min_execution_time})
-
- return list_of_optimizations
-
- def try_to_get_default_explain_hints(self, cur, optimization, original_query):
- if not original_query.explain_hints:
- if self.config.enable_statistics or optimization.execution_plan is None:
- evaluate_sql(cur, optimization.get_heuristic_explain())
-
- execution_plan = self.config.database.get_execution_plan('\n'.join(
- str(item[0]) for item in cur.fetchall()))
- else:
- execution_plan = optimization.execution_plan
-
- if original_query.compare_plans(execution_plan) and original_query.tips_looks_fair(
- optimization):
- # store execution plan hints from optimization
- original_query.explain_hints = optimization.explain_hints
diff --git a/src/utils.py b/src/utils.py
index 13a3563d..faaa1461 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -1,15 +1,18 @@
+import difflib
import hashlib
+import json
import re
import time
import traceback
+import pglast
from copy import copy
import psycopg2
-from sql_metadata import Parser
+from psycopg2._psycopg import cursor
from config import Config
from db.database import Database
-from objects import Query
+from objects import Query, FieldInTableHelper
PARAMETER_VARIABLE = r"[^'](\%\((.*?)\))"
WITH_ORDINALITY = r"[Ww][Ii][Tt][Hh]\s*[Oo][Rr][Dd][Ii][Nn][Aa][Ll][Ii][Tt][yY]\s*[Aa][Ss]\s*.*(.*)"
@@ -32,20 +35,42 @@ def remove_with_ordinality(sql_str):
return sql_str
-def get_result(cur, is_dml):
+def get_result_for_consistency_check(cur, is_dml: bool, has_order_by: bool, has_limit: bool):
+ if is_dml:
+ return cur.rowcount, f"{cur.rowcount} updates"
+
+ # a LIMIT without ORDER BY returns a non-deterministic row set, so results cannot be validated
+ str_result = []
+ cardinality = 0
+ if has_limit and not has_order_by:
+ str_result = ["LIMIT_WITHOUT_ORDER_BY"]
+ else:
+ result = cur.fetchall()
+
+ for row in result:
+ cardinality += 1
+ for column_value in row:
+ str_result.append(str(column_value))
+
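+ # without ORDER BY the row order is undefined, so sort the collected values to make the hash deterministic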
+ if not has_order_by:
+ str_result.sort()
+
+ return cardinality, ''.join(str_result)
+
+def get_result(cur, is_dml: bool):
if is_dml:
return cur.rowcount, f"{cur.rowcount} updates"
result = cur.fetchall()
- str_result = ""
+ str_result = []
cardinality = 0
for row in result:
cardinality += 1
for column_value in row:
- str_result += f"{str(column_value)}"
+ str_result.append(str(column_value))
- return cardinality, str_result
+ return cardinality, ''.join(str_result)
def calculate_avg_execution_time(cur,
@@ -59,48 +84,61 @@ def calculate_avg_execution_time(cur,
query_str = query_str or query.get_query()
query_str_lower = query_str.lower() if query_str is not None else None
+ has_order_by = query.has_order_by
+ has_limit = " limit " in query_str_lower
+
with_analyze = query_with_analyze(query_str_lower)
is_dml = query_is_dml(query_str_lower)
- sum_execution_times = 0
+ execution_times = []
actual_evaluations = 0
# run at least one iteration
num_retries = max(num_retries, 2)
num_warmup = config.num_warmup
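+ # the execution plan is collected and server query statistics are reset only once,
+ # on the first measured (non-warmup) iteration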
+ execution_plan_collected = False
+ stats_reset = False
for iteration in range(num_retries + num_warmup):
# noinspection PyUnresolvedReferences
try:
- start_time = current_milli_time()
- query.parameters = evaluate_sql(cur, query_str)
-
- result = None
- if iteration >= num_warmup and with_analyze:
- _, result = get_result(cur, is_dml)
- connection.rollback()
- # todo make wrapper for this
- sut_database.prepare_query_execution(cur)
-
- # get cardinality for queries with analyze
- evaluate_sql(cur, query.get_query())
- cardinality, _ = get_result(cur, is_dml)
- connection.rollback()
- sut_database.prepare_query_execution(cur)
-
- sum_execution_times += extract_execution_time_from_analyze(result)
- query.result_cardinality = cardinality
- else:
- sum_execution_times += current_milli_time() - start_time
+ if config.yugabyte_collect_stats and iteration >= num_warmup and not stats_reset:
+ sut_database.reset_query_statics(cur)
+ stats_reset = True
+
+ sut_database.prepare_query_execution(cur, query)
if iteration == 0:
- if not result:
- cardinality, result = get_result(cur, is_dml)
- query.result_cardinality = cardinality
- connection.rollback()
- sut_database.prepare_query_execution(cur)
+ # use the first iteration purely as a result-collection step: run the plain test query
+ # (without ANALYZE) and hash its result, even when the configured explain clause is EXPLAIN ANALYZE
+ query.parameters = evaluate_sql(cur, query.get_query())
+ cardinality, result = get_result_for_consistency_check(cur, is_dml, has_order_by, has_limit)
+ query.result_cardinality = cardinality
query.result_hash = get_md5(result)
+ else:
+ if iteration < num_warmup:
+ query.parameters = evaluate_sql(cur, query_str)
+ _, result = get_result(cur, is_dml)
+ else:
+ if not execution_plan_collected:
+ collect_execution_plan(cur, connection, query, sut_database)
+ execution_plan_collected = True
+
+ # prepare execution again
+ sut_database.prepare_query_execution(cur, query)
+
+ start_time = current_milli_time()
+
+ evaluate_sql(cur, query_str)
+ config.logger.debug("SQL >> Getting results")
+ _, result = get_result(cur, is_dml)
+
+ if with_analyze:
+ execution_times.append(extract_execution_time_from_analyze(result))
+ else:
+ execution_times.append(current_milli_time() - start_time)
except psycopg2.errors.QueryCanceled:
# failed by timeout - it's ok just skip optimization
query.execution_time_ms = -1
@@ -114,14 +152,62 @@ def calculate_avg_execution_time(cur,
traceback.print_exc(limit=None, file=None, chain=True)
return False
finally:
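+ # always roll back after each attempt; retry a few times in case the connection is momentarily unusable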
+ rolled_back_tries = 0
+ rolled_back = False
+ while rolled_back_tries < 5:
+ rolled_back_tries += 1
+ try:
+ connection.rollback()
+ rolled_back = True
+ break
+ except Exception as e:
+ time.sleep(2)
+ if not rolled_back:
+ config.logger.error(
+ f"INTERNAL ERROR Failed to rollback transaction after failed query execution:\n{query_str}")
+
if iteration >= num_warmup:
actual_evaluations += 1
- query.execution_time_ms = sum_execution_times / actual_evaluations
+ # TODO convert execution_time_ms into a property
+ query.execution_time_ms = sum(execution_times) / len(execution_times)
+
+ if config.yugabyte_collect_stats:
+ sut_database.collect_query_statistics(cur, query, query_str)
return True
+def find_order_by_in_query(query_str_lower):
+ try:
+ statement_json = pglast.parser.parse_sql_json(query_str_lower)
+ statement_dict = json.loads(statement_json)
+ has_order_by = 'sortClause' in list(statement_dict["stmts"][0]['stmt'].values())[0]
+ except Exception:
+ has_order_by = False
+
+ return has_order_by
+
+
+def collect_execution_plan(cur,
+ connection,
+ query: Query,
+ sut_database: Database):
+ try:
+ evaluate_sql(cur, query.get_explain())
+ query.execution_plan = sut_database.get_execution_plan(
+ '\n'.join(str(item[0]) for item in cur.fetchall())
+ )
+
+ connection.rollback()
+ except psycopg2.errors.QueryCanceled as e:
+ # failed by timeout - acceptable, just record an empty plan for this query
+ Config().logger.debug(f"Getting execution plan failed with {e}")
+
+ query.execution_time_ms = 0
+ query.execution_plan = sut_database.get_execution_plan("")
+
+
def query_with_analyze(query_str_lower):
return query_str_lower is not None and \
"explain" in query_str_lower and \
@@ -135,7 +221,28 @@ def query_is_dml(query_str_lower):
def extract_execution_time_from_analyze(result):
- matches = re.finditer(r"Execution\sTime:\s(\d+\.\d+)\sms", result, re.MULTILINE)
+ extracted = -1
+ matches = re.findall(r"(?
+ config.logger.debug(f"SQL >> {sql}[{parameters}]")
cur.execute(sql, parameters)
except psycopg2.errors.QueryCanceled as e:
+ if not mute_exceptions:
+ config.logger.debug(f"UNSTABLE: {sql_wo_parameters}", sql)
cur.connection.rollback()
raise e
+ except psycopg2.errors.DuplicateDatabase as ddb:
+ cur.connection.rollback()
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql}[{parameters}]", ddb)
except psycopg2.errors.ConfigurationLimitExceeded as cle:
cur.connection.rollback()
- config.logger.exception(sql, cle)
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql}[{parameters}]", cle)
+ if force_warning:
+ config.has_warnings = True
+ else:
+ config.has_failures = True
+
+ if config.exit_on_fail:
+ exit(1)
except psycopg2.OperationalError as oe:
cur.connection.rollback()
- config.logger.exception(sql, oe)
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql}[{parameters}]", oe)
+ if force_warning:
+ config.has_warnings = True
+ else:
+ config.has_failures = True
+
+ if config.exit_on_fail:
+ exit(1)
except Exception as e:
cur.connection.rollback()
- config.logger.exception(sql_wo_parameters, e)
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql}[{parameters}]", e)
+ if force_warning:
+ config.has_warnings = True
+ else:
+ config.has_failures = True
+
+ if config.exit_on_fail:
+ exit(1)
+
raise e
else:
try:
+ config.logger.debug(f"SQL >> {sql_wo_parameters}")
cur.execute(sql_wo_parameters)
except psycopg2.errors.QueryCanceled as e:
cur.connection.rollback()
+
+ if not mute_exceptions:
+ config.logger.debug(f"UNSTABLE: {sql_wo_parameters}", sql_wo_parameters)
raise e
+ except psycopg2.errors.DuplicateDatabase as ddb:
+ cur.connection.rollback()
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql_wo_parameters}", ddb)
except psycopg2.errors.ConfigurationLimitExceeded as cle:
cur.connection.rollback()
- config.logger.exception(sql, cle)
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql_wo_parameters}", cle)
+ if force_warning:
+ config.has_warnings = True
+ else:
+ config.has_failures = True
+
+ if config.exit_on_fail:
+ exit(1)
except psycopg2.OperationalError as oe:
cur.connection.rollback()
- config.logger.exception(sql, oe)
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql_wo_parameters}", oe)
+ if force_warning:
+ config.has_warnings = True
+ else:
+ config.has_failures = True
+
+ if config.exit_on_fail:
+ exit(1)
except Exception as e:
cur.connection.rollback()
- config.logger.exception(sql_wo_parameters, e)
+
+ if not mute_exceptions:
+ config.logger.exception(f"UNSTABLE: {sql_wo_parameters}", e)
+ if force_warning:
+ config.has_warnings = True
+ else:
+ config.has_failures = True
+
+ if config.exit_on_fail:
+ exit(1)
+
raise e
return parameters
@@ -269,12 +518,31 @@ def allowed_diff(config, original_execution_time, optimization_execution_time):
def get_md5(string: str):
- return str(hashlib.md5(string.encode('utf-8')).hexdigest())
+ return hashlib.md5(string.encode('utf-8')).hexdigest()
-def get_bool_from_str(string: str):
+def get_bool_from_object(string: str | bool | int):
return string in {True, 1, "True", "true", "TRUE", "T"}
+def get_model_path(model):
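+ # absolute ("/...") or relative ("./...") model paths are used as-is; bare names resolve under sql/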
+ if model.startswith("/") or model.startswith("."):
+ return model
+ else:
+ return f"sql/{model}"
+
+
def disabled_path(query):
return query.execution_plan.get_estimated_cost() < 10000000000
+
+
+def get_plan_diff(baseline, changed):
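+ # unified diff between two plans with the "+++"/"---" file headers and "@@" hunk markers filtered out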
+ return "\n".join(
+ text for text in difflib.unified_diff(baseline.split("\n"), changed.split("\n")) if
+ text[:3] not in ('+++', '---', '@@ '))
+
+
+def seconds_to_readable_minutes(seconds):
+ minutes = seconds // 60
+ remaining_seconds = seconds % 60
+ return f"{minutes} minute{'s' if minutes != 1 else ''} and {remaining_seconds:.2f} second{'s' if remaining_seconds != 1 else ''}"