Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix query on unspecified property #191

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 150 additions & 4 deletions src/core/functions/table/match.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <duckpgq_extension.hpp>
#include "duckpgq/core/functions/table/match.hpp"

#include "duckdb/common/string_util.hpp"
#include "duckdb/common/case_insensitive_map.hpp"
#include "duckdb/parser/tableref/matchref.hpp"
#include "duckdb/parser/tableref/subqueryref.hpp"
#include "duckdb/parser/tableref/joinref.hpp"
Expand Down Expand Up @@ -33,6 +35,55 @@ namespace duckpgq {

namespace core {

namespace {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is there a namespace here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The anonymous namespace (or static) is to constrain symbol to live only in the current translation unit.
Reference: https://google.github.io/styleguide/cppguide.html#Internal_Linkage

I don't have a strong preference for unamed namespace vs static; use the former here since it's used in my company.


// Get fully-qualified column names for the property graph [tbl], and insert
// into set [col_names].
void PopulateFullyQualifiedColName(
const vector<shared_ptr<PropertyGraphTable>> &tbls,
const case_insensitive_map_t<vector<string>> &tbl_name_to_aliases,
case_insensitive_set_t &col_names) {
for (const auto &cur_tbl : tbls) {
for (const auto &cur_col : cur_tbl->column_names) {
// It's legal to query by `<col>` instead of `<table>.<col>`.
col_names.insert(cur_col);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if this will also be sufficient for property graphs on attached databases. Then we also need to add the catalog and schema ("catalog"."schema"."table"."column").
I would like to see a test case handle this. For examples of attached databases, see https://github.com/cwida/duckpgq-extension/blob/9016214bbbbfd4e0f0fd5f4d9dfa74c61ab9d385/test/sql/create_pg/attach_pg.test

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's a good point, and it's not hard to get the catalog and schema name since they're already part of PropertyGraphTable.

However after some trials, I found it's failing due to another bind error:
https://github.com/cwida/duckdb-pgq/blob/4d5ca72d02b0844dc0acc6e605db5100b6761a29/src/planner/bind_context.cpp#L240-L255

Checking the variable, only <table>.<column> is stored in the bindings here.


const string &tbl_name = cur_tbl->table_name;
auto iter = tbl_name_to_aliases.find(tbl_name);
// Prefer to use table alias specified in the statement, otherwise use
// table name.
if (iter == tbl_name_to_aliases.end()) {
col_names.insert(StringUtil::Format("%s.%s", tbl_name, cur_col));
} else {
const auto &all_aliases = iter->second;
for (const auto &cur_alias : all_aliases) {
col_names.insert(StringUtil::Format("%s.%s", cur_alias, cur_col));
}
}
}
}
}

// Get fully-qualified column names from property graph.
case_insensitive_set_t GetFullyQualifiedColFromPg(
const CreatePropertyGraphInfo &pg,
const case_insensitive_map_t<shared_ptr<PropertyGraphTable>> &alias_map) {
case_insensitive_map_t<vector<string>> relation_name_to_aliases;
for (const auto &entry : alias_map) {
relation_name_to_aliases[entry.second->table_name].emplace_back(
entry.first);
}

case_insensitive_set_t col_names;
PopulateFullyQualifiedColName(pg.vertex_tables, relation_name_to_aliases,
col_names);
PopulateFullyQualifiedColName(pg.edge_tables, relation_name_to_aliases,
col_names);
return col_names;
}

} // namespace

shared_ptr<PropertyGraphTable>
PGQMatchFunction::FindGraphTable(const string &label,
CreatePropertyGraphInfo &pg_table) {
Expand Down Expand Up @@ -145,6 +196,18 @@ PathElement *PGQMatchFunction::GetPathElement(
throw InternalException("Unknown path reference type detected");
}

SubPath *
PGQMatchFunction::GetSubPath(const unique_ptr<PathReference> &path_reference) {
if (path_reference->path_reference_type ==
PGQPathReferenceType::PATH_ELEMENT) {
return nullptr;
}
if (path_reference->path_reference_type == PGQPathReferenceType::SUBPATH) {
return reinterpret_cast<SubPath *>(path_reference.get());
}
throw InternalException("Unknown path reference type detected");
}

unique_ptr<SubqueryRef> PGQMatchFunction::CreateCountCTESubquery() {
//! BEGIN OF (SELECT count(cte1.temp) as temp * 0 from cte1) __x

Expand Down Expand Up @@ -907,15 +970,88 @@ void PGQMatchFunction::ProcessPathList(
}
}

void PGQMatchFunction::PopulateGraphTableAliasMap(
const CreatePropertyGraphInfo &pg_table,
const unique_ptr<PathReference> &path_reference,
case_insensitive_map_t<shared_ptr<PropertyGraphTable>>
&alias_to_vertex_and_edge_tables) {
PathElement *path_elem = GetPathElement(path_reference);

// Populate binding from PathElement.
if (path_elem != nullptr) {
auto iter = pg_table.label_map.find(path_elem->label);
if (iter == pg_table.label_map.end()) {
throw BinderException(
"The label %s is not registered in property graph %s",
path_elem->label, pg_table.property_graph_name);
}
alias_to_vertex_and_edge_tables[path_elem->variable_binding] = iter->second;
return;
}

// Recursively populate binding from SubPath.
SubPath *sub_path = GetSubPath(path_reference);
D_ASSERT(sub_path != nullptr);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if the result of sub_path is best checked with an assert. Instead with an if (!subpath) and then handling that case.

You already checked whether it is a PathElement earlier in the function PathElement *path_elem = GetPathElement(path_reference);. But I am not sure it is very intuitive for me.

Copy link
Contributor Author

@dentiny dentiny Jan 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it's a sanity check, because we only have two valid types: path element and subpath.
As you mentioned, we already handled path element case above;
the check here is to make sure path reference is not unknown.
Let me know your preference, happy to change :)

const auto &path_list = sub_path->path_list;
for (const auto &cur_path : path_list) {
PopulateGraphTableAliasMap(pg_table, cur_path,
alias_to_vertex_and_edge_tables);
}
}

void PGQMatchFunction::CheckColumnBinding(
const CreatePropertyGraphInfo &pg_table, const MatchExpression &ref) {
// Maps from table alias to table, including vertex and edge tables.
case_insensitive_map_t<shared_ptr<PropertyGraphTable>>
alias_to_vertex_and_edge_tables;
for (idx_t idx_i = 0; idx_i < ref.path_patterns.size(); idx_i++) {
const auto &path_list = ref.path_patterns[idx_i]->path_elements;
for (const auto &cur_path : path_list) {
PopulateGraphTableAliasMap(pg_table, cur_path,
alias_to_vertex_and_edge_tables);
}
}

// All fully-qualified column names for vertex tables and edge tables.
const auto all_fq_col_names =
GetFullyQualifiedColFromPg(pg_table, alias_to_vertex_and_edge_tables);

for (auto &expression : ref.column_list) {
// TODO(hjiang): `ColumnRefExpression` alone is not enough, we could have
// more complicated expression.
//
// See issue for reference:
// https://github.com/cwida/duckpgq-extension/issues/198
auto *column_ref = dynamic_cast<ColumnRefExpression *>(expression.get());
if (column_ref == nullptr) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might not be a sufficient check, since there may be something else other than a ColumnRefExpression such as a FunctionExpression. This FunctionExpression could then contain a non-registered property, which should be checked.

I would at least like to see a test case check this:

statement ok
create table person (name varchar, lastname varchar); insert into person values ('Alice', 'Foo');

statement ok 
-create property graph pg vertex tables (person properties (name));

query I
-from graph_table (pg match (p:person) columns (p.name || p.name as name));
----
AliceAlice

statement error
-from graph_table (pg match (p:person) columns(p.name, p.name || p.lastname)); 
----
BinderException: p.notregistered is not a registered property for label person in property graph pg

The exact error message does not need to be this of course.

This can also be implemented as another PR since this can become quite nested.

Copy link
Contributor Author

@dentiny dentiny Jan 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see what you mean, that makes sense; currently I don't see an easy solution.

I create another issue to track: #198, and leave a TODO under my name.

continue;
}
// 'shortest_path_cte' is a special table populated by pgq.
if (column_ref->column_names[0] == "shortest_path_cte") {
continue;
}
// 'rowid' is a column duckdb binds automatically.
if (column_ref->column_names.back() == "rowid") {
continue;
}
const auto cur_fq_col_name =
StringUtil::Join(column_ref->column_names, /*separator=*/".");
if (all_fq_col_names.find(cur_fq_col_name) == all_fq_col_names.end()) {
throw BinderException("Property %s is never registered!",
cur_fq_col_name);
}
}
}

unique_ptr<TableRef>
PGQMatchFunction::MatchBindReplace(ClientContext &context,
TableFunctionBindInput &bind_input) {
auto duckpgq_state = GetDuckPGQState(context);

auto match_index = bind_input.inputs[0].GetValue<int32_t>();
auto ref = dynamic_cast<MatchExpression *>(
auto *ref = dynamic_cast<MatchExpression *>(
duckpgq_state->transform_expression[match_index].get());
auto pg_table = duckpgq_state->GetPropertyGraph(ref->pg_name);
auto *pg_table = duckpgq_state->GetPropertyGraph(ref->pg_name);

vector<unique_ptr<ParsedExpression>> conditions;

Expand Down Expand Up @@ -948,11 +1084,16 @@ PGQMatchFunction::MatchBindReplace(ClientContext &context,
if (ref->where_clause) {
conditions.push_back(std::move(ref->where_clause));
}

CheckColumnBinding(*pg_table, *ref);
dentiny marked this conversation as resolved.
Show resolved Hide resolved

std::vector<unique_ptr<ParsedExpression>> final_column_list;

for (auto &expression : ref->column_list) {
unordered_set<string> named_subpaths;
auto column_ref = dynamic_cast<ColumnRefExpression *>(expression.get());

// Handle ColumnRefExpression.
auto *column_ref = dynamic_cast<ColumnRefExpression *>(expression.get());
if (column_ref != nullptr) {
if (named_subpaths.count(column_ref->column_names[0]) &&
column_ref->column_names.size() == 1) {
Expand All @@ -963,7 +1104,9 @@ PGQMatchFunction::MatchBindReplace(ClientContext &context,
}
continue;
}
auto function_ref = dynamic_cast<FunctionExpression *>(expression.get());

// Handle FunctionExpression.
auto *function_ref = dynamic_cast<FunctionExpression *>(expression.get());
if (function_ref != nullptr) {
if (function_ref->function_name == "path_length") {
column_ref = dynamic_cast<ColumnRefExpression *>(
Expand Down Expand Up @@ -995,6 +1138,9 @@ PGQMatchFunction::MatchBindReplace(ClientContext &context,
continue;
}

// TODO(hjiang): For star expression, only select columns in vertex or edge
// table, but not those unspecified in property graph.
// Issue reference: https://github.com/cwida/duckpgq-extension/issues/192
final_column_list.push_back(std::move(expression));
}

Expand Down
16 changes: 16 additions & 0 deletions src/include/duckpgq/core/functions/table/match.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,20 @@ struct PGQMatchFunction : public TableFunction {
vector<string> vertex_keys, vector<string> edge_keys,
const string &vertex_alias, const string &edge_alias);

// Populate all vertex and edge tables and their alias into
// [alias_to_vertex_and_edge_tables], from paths information from
// [path_reference].
static void PopulateGraphTableAliasMap(
const CreatePropertyGraphInfo &pg_table,
const unique_ptr<PathReference> &path_reference,
case_insensitive_map_t<shared_ptr<PropertyGraphTable>>
&alias_to_vertex_and_edge_tables);

static PathElement *
GetPathElement(const unique_ptr<PathReference> &path_reference);

static SubPath *GetSubPath(const unique_ptr<PathReference> &path_reference);

static unique_ptr<JoinRef>
GetJoinRef(const shared_ptr<PropertyGraphTable> &edge_table,
const string &edge_binding, const string &prev_binding,
Expand Down Expand Up @@ -157,6 +168,11 @@ struct PGQMatchFunction : public TableFunction {
CreatePropertyGraphInfo &pg_table,
unique_ptr<SelectNode> &final_select_node,
vector<unique_ptr<ParsedExpression>> &conditions);

// Check whether columns to query are valid against the property graph, throws
// BinderException if error.
static void CheckColumnBinding(const CreatePropertyGraphInfo &pg_table,
const MatchExpression &ref);
};

} // namespace core
Expand Down
9 changes: 9 additions & 0 deletions test/sql/create_pg/no_properties.test
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,12 @@ EDGE TABLES (
DESTINATION KEY ( dst ) REFERENCES Student ( id )
LABEL Knows
)

statement ok
-CREATE PROPERTY GRAPH g VERTEX TABLES (Student PROPERTIES (id));

# Query on unspecified columns in the graph throws error.
statement error
-SELECT * FROM GRAPH_TABLE (g MATCH (s:Student) COLUMNS (s.id, s.name));
----
Binder Error: Property s.name is never registered!
2 changes: 1 addition & 1 deletion test/sql/path_finding/complex_matching.test
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ statement error
) tmp
limit 10;
----
Binder Error: Referenced column "o" not found in FROM clause!
Binder Error: Property o is never registered!


# https://github.com/cwida/duckpgq-extension/issues/68
Expand Down
2 changes: 1 addition & 1 deletion test/sql/path_finding/shortest_path.test
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ statement error
COLUMNS (p, a.name as name, b.name as b_name)
) study;
----
Binder Error: Referenced column "p" not found in FROM clause!
Binder Error: Property p is never registered


query III
Expand Down
Loading