diff --git a/src/brain/index_selection.cpp b/src/brain/index_selection.cpp new file mode 100644 index 00000000000..cbaf0c516e8 --- /dev/null +++ b/src/brain/index_selection.cpp @@ -0,0 +1,488 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection.cpp +// +// Identification: src/brain/index_selection.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "brain/index_selection.h" +#include "brain/what_if_index.h" + +namespace peloton { +namespace brain { + +IndexSelection::IndexSelection(Workload &query_set, IndexSelectionKnobs knobs, + concurrency::TransactionContext *txn) + : query_set_(query_set), context_(knobs), txn_(txn) {} + +void IndexSelection::GetBestIndexes(IndexConfiguration &final_indexes) { + // http://www.vldb.org/conf/1997/P146.PDF + // Figure 4 of the "Index Selection Tool" paper. + // Split the workload 'W' into small workloads 'Wi', with each + // containing one query, and find out the candidate indexes + // for these 'Wi' + // Finally, combine all the candidate indexes 'Ci' into a larger + // set to form a candidate set 'C' for the provided workload 'W'. + + // The best indexes after every iteration + IndexConfiguration candidate_indexes; + // Single column indexes that are useful for at least one query + IndexConfiguration admissible_indexes; + + // Start the index selection. 
+ for (unsigned long i = 0; i < context_.knobs_.num_iterations_; i++) { + LOG_DEBUG("******* Iteration %ld **********", i); + LOG_DEBUG("Candidate Indexes Before: %s", + candidate_indexes.ToString().c_str()); + GenerateCandidateIndexes(candidate_indexes, admissible_indexes, query_set_); + LOG_DEBUG("Admissible Indexes: %s", admissible_indexes.ToString().c_str()); + LOG_DEBUG("Candidate Indexes After: %s", + candidate_indexes.ToString().c_str()); + + // Configuration Enumeration + IndexConfiguration top_candidate_indexes; + Enumerate(candidate_indexes, top_candidate_indexes, query_set_, + context_.knobs_.num_indexes_); + LOG_DEBUG("Top Candidate Indexes: %s", + candidate_indexes.ToString().c_str()); + + candidate_indexes = top_candidate_indexes; + + // Generate multi-column indexes before starting the next iteration. + // Only do this if there is next iteration. + if (i < (context_.knobs_.num_iterations_ - 1)) { + GenerateMultiColumnIndexes(top_candidate_indexes, admissible_indexes, + candidate_indexes); + } + } + + final_indexes = candidate_indexes; +} + +void IndexSelection::GenerateCandidateIndexes( + IndexConfiguration &candidate_config, IndexConfiguration &admissible_config, + Workload &workload) { + // If there are no admissible indexes, then this is the first iteration. + // Candidate indexes will be a union of admissible index set of each query. + if (admissible_config.IsEmpty() && candidate_config.IsEmpty()) { + for (auto query : workload.GetQueries()) { + Workload wi(query, workload.GetDatabaseName()); + + IndexConfiguration ai; + GetAdmissibleIndexes(query.first, ai); + admissible_config.Merge(ai); + + IndexConfiguration pruned_ai; + PruneUselessIndexes(ai, wi, pruned_ai); + // Candidate config for the single-column indexes is the union of + // candidates for each query. 
+ candidate_config.Merge(pruned_ai); + } + LOG_TRACE("Single column candidate indexes: %lu", + candidate_config.GetIndexCount()); + } else { + LOG_TRACE("Pruning multi-column indexes"); + IndexConfiguration pruned_ai; + PruneUselessIndexes(candidate_config, workload, pruned_ai); + candidate_config.Set(pruned_ai); + } +} + +void IndexSelection::PruneUselessIndexes(IndexConfiguration &config, + Workload &workload, + IndexConfiguration &pruned_config) { + IndexConfiguration empty_config; + auto indexes = config.GetIndexes(); + + for (auto it = indexes.begin(); it != indexes.end(); it++) { + bool is_useful = false; + + for (auto query : workload.GetQueries()) { + IndexConfiguration c; + c.AddIndexObject(*it); + + Workload w(query, workload.GetDatabaseName()); + + auto c1 = ComputeCost(c, w); + auto c2 = ComputeCost(empty_config, w); + LOG_TRACE("Cost with index %s is %lf", c.ToString().c_str(), c1); + LOG_TRACE("Cost without is %lf", c2); + + if (c1 < c2) { + is_useful = true; + break; + } + } + // Index is useful if it benefits any query. + if (is_useful) { + pruned_config.AddIndexObject(*it); + } + } +} + +void IndexSelection::Enumerate(IndexConfiguration &indexes, + IndexConfiguration &top_indexes, + Workload &workload, size_t num_indexes) { + // Get the cheapest indexes through exhaustive search upto a threshold + ExhaustiveEnumeration(indexes, top_indexes, workload); + + // Get all the remaining indexes which can be part of our optimal set + auto remaining_indexes = indexes - top_indexes; + + // Greedily add the remaining indexes until there is no improvement in the + // cost or our required size is reached + GreedySearch(top_indexes, remaining_indexes, workload, num_indexes); +} + +void IndexSelection::GreedySearch(IndexConfiguration &indexes, + IndexConfiguration &remaining_indexes, + Workload &workload, size_t k) { + // Algorithm: + // 1. Let S = the best m index configuration using the naive enumeration + // algorithm. If m = k then exit. + // 2. 
Pick a new index I such that Cost (S U {I}, W) <= Cost(S U {I'}, W) for + // any choice of I' != I + // 3. If Cost (S U {I}) >= Cost(S) then exit + // Else S = S U {I} + // 4. If |S| = k then exit + LOG_TRACE("GREEDY: Starting with the following index: %s", + indexes.ToString().c_str()); + size_t current_index_count = indexes.GetIndexCount(); + + LOG_TRACE("GREEDY: At start: #indexes chosen : %zu, #num_indexes: %zu", + current_index_count, k); + + if (current_index_count >= k) return; + + double global_min_cost = ComputeCost(indexes, workload); + double cur_min_cost = global_min_cost; + double cur_cost; + std::shared_ptr best_index; + + // go through till you get top k indexes + while (current_index_count < k) { + // this is the set S so far + auto new_indexes = indexes; + for (auto const &index : remaining_indexes.GetIndexes()) { + new_indexes = indexes; + new_indexes.AddIndexObject(index); + cur_cost = ComputeCost(new_indexes, workload); + LOG_TRACE("GREEDY: Considering this index: %s \n with cost: %lf", + index->ToString().c_str(), cur_cost); + if (cur_cost < cur_min_cost || + (best_index != nullptr && cur_cost == cur_min_cost && + new_indexes.ToString() < best_index->ToString())) { + cur_min_cost = cur_cost; + best_index = index; + } + } + + // if we found a better configuration + if (cur_min_cost < global_min_cost) { + LOG_TRACE("GREEDY: Adding the following index: %s", + best_index->ToString().c_str()); + indexes.AddIndexObject(best_index); + remaining_indexes.RemoveIndexObject(best_index); + current_index_count++; + global_min_cost = cur_min_cost; + + // we are done with all remaining indexes + if (remaining_indexes.GetIndexCount() == 0) { + LOG_TRACE("GREEDY: Breaking because nothing more"); + break; + } + } else { // we did not find any better index to add to our current + // configuration + LOG_TRACE("GREEDY: Breaking because nothing better found"); + break; + } + } +} + +void IndexSelection::ExhaustiveEnumeration(IndexConfiguration &indexes, + 
IndexConfiguration &top_indexes, + Workload &workload) { + // Get the best m index configurations using the naive enumeration algorithm + // The naive algorithm gets all the possible subsets of size <= m and then + // returns the cheapest m indexes + + auto max_num_indexes = std::min(context_.knobs_.naive_enumeration_threshold_, + context_.knobs_.num_indexes_); + + // Define a set ordering of (index config, cost) and define the ordering in + // the set + std::set, IndexConfigComparator> + running_index_config(workload), temp_index_config(workload), + result_index_config(workload); + + IndexConfiguration new_element; + + // Add an empty configuration as initialization + IndexConfiguration empty; + // The running index configuration contains the possible subsets generated so + // far. It is updated after every iteration + auto cost_empty = ComputeCost(empty, workload); + running_index_config.emplace(empty, cost_empty); + + for (auto const &index : indexes.GetIndexes()) { + // Make a copy of the running index configuration and add each element to it + temp_index_config = running_index_config; + + for (auto t : temp_index_config) { + new_element = t.first; + new_element.AddIndexObject(index); + + // If the size of the subset reaches our threshold, add to result set + // instead of adding to the running list + if (new_element.GetIndexCount() >= max_num_indexes) { + result_index_config.emplace(new_element, + ComputeCost(new_element, workload)); + } else { + running_index_config.emplace(new_element, + ComputeCost(new_element, workload)); + } + } + } + + // Put all the subsets in the result set + result_index_config.insert(running_index_config.begin(), + running_index_config.end()); + // Remove the starting empty set that we added + result_index_config.erase({empty, cost_empty}); + + for (auto index : result_index_config) { + LOG_TRACE("EXHAUSTIVE: Index: %s, Cost: %lf", + index.first.ToString().c_str(), index.second); + } + + // Since the insertion into the sets ensures 
the order of cost, get the first + // m configurations + if (result_index_config.empty()) return; + + // if having no indexes is better (for eg. for insert heavy workload), + // then don't choose anything + if (cost_empty < result_index_config.begin()->second) return; + + auto best_m_index = result_index_config.begin()->first; + top_indexes.Merge(best_m_index); +} + +void IndexSelection::GetAdmissibleIndexes( + std::shared_ptr query, IndexConfiguration &indexes) { + // Find out the indexable columns of the given workload. + // The following rules define what indexable columns are: + // 1. A column that appears in the WHERE clause with format + // ==> Column OP Expr <== + // OP such as {=, <, >, <=, >=, LIKE, etc.} + // Column is a table column name. + // 2. GROUP BY (if present) + // 3. ORDER BY (if present) + // 4. all updated columns for UPDATE query. + switch (query->GetType()) { + case StatementType::INSERT: { + auto insert_stmt = dynamic_cast(query.get()); + // If the insert is along with a select statement, i.e another table's + // select output is fed into this table. 
+ if (insert_stmt->select != nullptr) { + IndexColsParseWhereHelper(insert_stmt->select->where_clause.get(), + indexes); + } + break; + } + + case StatementType::DELETE: { + auto delete_stmt = dynamic_cast(query.get()); + IndexColsParseWhereHelper(delete_stmt->expr.get(), indexes); + break; + } + + case StatementType::UPDATE: { + auto update_stmt = dynamic_cast(query.get()); + IndexColsParseWhereHelper(update_stmt->where.get(), indexes); + break; + } + + case StatementType::SELECT: { + auto select_stmt = dynamic_cast(query.get()); + IndexColsParseWhereHelper(select_stmt->where_clause.get(), indexes); + IndexColsParseOrderByHelper(select_stmt->order, indexes); + IndexColsParseGroupByHelper(select_stmt->group_by, indexes); + break; + } + + default: { LOG_DEBUG("DDL Statement encountered, Ignoring.."); } + } +} + +void IndexSelection::IndexColsParseWhereHelper( + const expression::AbstractExpression *where_expr, + IndexConfiguration &config) { + if (where_expr == nullptr) { + LOG_DEBUG("No Where Clause Found"); + return; + } + auto expr_type = where_expr->GetExpressionType(); + const expression::AbstractExpression *left_child; + const expression::AbstractExpression *right_child; + const expression::TupleValueExpression *tuple_child; + + switch (expr_type) { + case ExpressionType::COMPARE_EQUAL: + case ExpressionType::COMPARE_NOTEQUAL: + case ExpressionType::COMPARE_GREATERTHAN: + case ExpressionType::COMPARE_GREATERTHANOREQUALTO: + case ExpressionType::COMPARE_LESSTHAN: + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + case ExpressionType::COMPARE_LIKE: + case ExpressionType::COMPARE_NOTLIKE: + case ExpressionType::COMPARE_IN: + // Get left and right child and extract the column name. 
+ left_child = where_expr->GetChild(0); + right_child = where_expr->GetChild(1); + + // if where clause is something like a = b, we don't benefit from index + if (left_child->GetExpressionType() == ExpressionType::VALUE_TUPLE && + right_child->GetExpressionType() == ExpressionType::VALUE_TUPLE) { + return; + } + + // if where clause is something like 1 = 2, we don't benefit from index + if (left_child->GetExpressionType() == ExpressionType::VALUE_CONSTANT && + right_child->GetExpressionType() == ExpressionType::VALUE_CONSTANT) { + return; + } + + if (left_child->GetExpressionType() == ExpressionType::VALUE_TUPLE) { + PELOTON_ASSERT(right_child->GetExpressionType() != + ExpressionType::VALUE_TUPLE); + tuple_child = + dynamic_cast(left_child); + } else { + PELOTON_ASSERT(right_child->GetExpressionType() == + ExpressionType::VALUE_TUPLE); + tuple_child = + dynamic_cast(right_child); + } + + if (!tuple_child->GetIsBound()) { + LOG_ERROR("Query is not bound"); + PELOTON_ASSERT(false); + } + IndexObjectPoolInsertHelper(tuple_child->GetBoundOid(), config); + + break; + case ExpressionType::CONJUNCTION_AND: + case ExpressionType::CONJUNCTION_OR: + left_child = where_expr->GetChild(0); + right_child = where_expr->GetChild(1); + IndexColsParseWhereHelper(left_child, config); + IndexColsParseWhereHelper(right_child, config); + break; + default: + LOG_ERROR("Index selection doesn't allow %s in where clause", + where_expr->GetInfo().c_str()); + PELOTON_ASSERT(false); + } +} + +void IndexSelection::IndexColsParseGroupByHelper( + std::unique_ptr &group_expr, + IndexConfiguration &config) { + if ((group_expr == nullptr) || (group_expr->columns.size() == 0)) { + LOG_DEBUG("Group by expression not present"); + return; + } + auto &columns = group_expr->columns; + for (auto it = columns.begin(); it != columns.end(); it++) { + PELOTON_ASSERT((*it)->GetExpressionType() == ExpressionType::VALUE_TUPLE); + auto tuple_value = (expression::TupleValueExpression *)((*it).get()); + 
IndexObjectPoolInsertHelper(tuple_value->GetBoundOid(), config); + } +} + +void IndexSelection::IndexColsParseOrderByHelper( + std::unique_ptr &order_expr, + IndexConfiguration &config) { + if ((order_expr == nullptr) || (order_expr->exprs.size() == 0)) { + LOG_DEBUG("Order by expression not present"); + return; + } + auto &exprs = order_expr->exprs; + for (auto it = exprs.begin(); it != exprs.end(); it++) { + PELOTON_ASSERT((*it)->GetExpressionType() == ExpressionType::VALUE_TUPLE); + auto tuple_value = (expression::TupleValueExpression *)((*it).get()); + IndexObjectPoolInsertHelper(tuple_value->GetBoundOid(), config); + } +} + +void IndexSelection::IndexObjectPoolInsertHelper( + const std::tuple &tuple_oid, + IndexConfiguration &config) { + auto db_oid = std::get<0>(tuple_oid); + auto table_oid = std::get<1>(tuple_oid); + auto col_oid = std::get<2>(tuple_oid); + + // Add the object to the pool. + HypotheticalIndexObject iobj(db_oid, table_oid, col_oid); + auto pool_index_obj = context_.pool_.GetIndexObject(iobj); + if (!pool_index_obj) { + pool_index_obj = context_.pool_.PutIndexObject(iobj); + } + config.AddIndexObject(pool_index_obj); +} + +double IndexSelection::ComputeCost(IndexConfiguration &config, + Workload &workload) { + double cost = 0.0; + auto queries = workload.GetQueries(); + for (auto query : queries) { + std::pair state = { + config, query.first.get()}; + if (context_.memo_.find(state) != context_.memo_.end()) { + cost += context_.memo_[state]; + } else { + auto result = WhatIfIndex::GetCostAndBestPlanTree( + query, config, workload.GetDatabaseName(), txn_); + context_.memo_[state] = result->cost; + cost += result->cost; + } + } + return cost; +} + +void IndexSelection::CrossProduct( + const IndexConfiguration &config, + const IndexConfiguration &single_column_indexes, + IndexConfiguration &result) { + auto indexes = config.GetIndexes(); + auto columns = single_column_indexes.GetIndexes(); + for (auto index : indexes) { + for (auto column : 
columns) { + if (!index->IsCompatible(column)) continue; + auto merged_index = (index->Merge(column)); + result.AddIndexObject(context_.pool_.PutIndexObject(merged_index)); + } + } +} + +void IndexSelection::GenerateMultiColumnIndexes( + IndexConfiguration &config, IndexConfiguration &single_column_indexes, + IndexConfiguration &result) { + CrossProduct(config, single_column_indexes, result); +} + +std::shared_ptr IndexSelection::AddConfigurationToPool( + HypotheticalIndexObject object) { + return context_.pool_.PutIndexObject(object); +} + +} // namespace brain +} // namespace peloton diff --git a/src/brain/index_selection_context.cpp b/src/brain/index_selection_context.cpp new file mode 100644 index 00000000000..3933b72c844 --- /dev/null +++ b/src/brain/index_selection_context.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection_context.cpp +// +// Identification: src/brain/index_selection_context.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "brain/index_selection_context.h" +#include "common/logger.h" + +namespace peloton { +namespace brain { + +IndexSelectionContext::IndexSelectionContext(IndexSelectionKnobs knobs) + : knobs_(knobs) {} + +} // namespace brain +} // namespace peloton diff --git a/src/brain/index_selection_job.cpp b/src/brain/index_selection_job.cpp new file mode 100644 index 00000000000..b1c739e1969 --- /dev/null +++ b/src/brain/index_selection_job.cpp @@ -0,0 +1,189 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection_job.cpp +// +// Identification: src/brain/index_selection_job.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include 
"brain/index_selection_util.h" +#include "brain/index_selection_job.h" +#include "brain/index_selection.h" +#include "catalog/query_history_catalog.h" +#include "catalog/system_catalogs.h" +#include "optimizer/stats/stats_storage.h" + +namespace peloton { +namespace brain { + +void IndexSelectionJob::OnJobInvocation(BrainEnvironment *env) { + LOG_INFO("Started Index Suggestion Task"); + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Analyze stats for all the tables. + // TODO: AnalyzeStatsForAllTables crashes sometimes. + optimizer::StatsStorage *stats_storage = + optimizer::StatsStorage::GetInstance(); + ResultType stats_result = stats_storage->AnalyzeStatsForAllTables(txn); + if (stats_result != ResultType::SUCCESS) { + LOG_ERROR( + "Cannot generate stats for table columns. Not performing index " + "suggestion..."); + txn_manager.AbortTransaction(txn); + return; + } + + // Query the catalog for new SQL queries. + // New SQL queries are the queries that were added to the system + // after the last_timestamp_ + auto &query_catalog = catalog::QueryHistoryCatalog::GetInstance(txn); + auto query_history = + query_catalog.GetQueryStringsAfterTimestamp(last_timestamp_, txn); + if (query_history->size() > num_queries_threshold_) { + LOG_INFO("Tuning threshold has crossed. Time to tune the DB!"); + + // Run the index selection. 
+ std::vector queries; + for (auto query_pair : *query_history) { + queries.push_back(query_pair.second); + } + + // TODO: Handle multiple databases + brain::Workload workload(queries, DEFAULT_DB_NAME, txn); + LOG_INFO("Knob: Num Indexes: %zu", + env->GetIndexSelectionKnobs().num_indexes_); + LOG_INFO("Knob: Naive: %zu", + env->GetIndexSelectionKnobs().naive_enumeration_threshold_); + LOG_INFO("Knob: Num Iterations: %zu", + env->GetIndexSelectionKnobs().num_iterations_); + brain::IndexSelection is = {workload, env->GetIndexSelectionKnobs(), txn}; + brain::IndexConfiguration best_config; + is.GetBestIndexes(best_config); + + if (best_config.IsEmpty()) { + LOG_INFO("Best config is empty. No new indexes this time..."); + } + + // Get the index objects from database. + auto database_object = catalog::Catalog::GetInstance()->GetDatabaseObject( + DEFAULT_DB_NAME, txn); + auto pg_index = catalog::Catalog::GetInstance() + ->GetSystemCatalogs(database_object->GetDatabaseOid()) + ->GetIndexCatalog(); + auto cur_indexes = pg_index->GetIndexObjects(txn); + auto drop_indexes = GetIndexesToDrop(cur_indexes, best_config); + + // Drop useless indexes. + for (auto index : drop_indexes) { + LOG_DEBUG("Dropping Index: %s", index->GetIndexName().c_str()); + DropIndexRPC(database_object->GetDatabaseOid(), index.get()); + } + + // Create new indexes. + for (auto index : best_config.GetIndexes()) { + CreateIndexRPC(index.get()); + } + + last_timestamp_ = GetLatestQueryTimestamp(query_history.get()); + } else { + LOG_INFO("Index Suggestion - not performing this time"); + } + txn_manager.CommitTransaction(txn); +} + +std::vector> +IndexSelectionJob::GetIndexesToDrop( + std::unordered_map> + &index_objects, + brain::IndexConfiguration best_config) { + std::vector> ret_indexes; + // Get the existing indexes and drop them. + for (auto index : index_objects) { + auto index_name = index.second->GetIndexName(); + // TODO [vamshi]: REMOVE THIS IN THE FINAL CODE + // This is a hack for now. 
Add a boolean to the index catalog to + // find out if an index is a brain suggested index/user created index. + if (index_name.find(brain_suggested_index_prefix_str) != + std::string::npos) { + bool found = false; + for (auto installed_index : best_config.GetIndexes()) { + if ((index.second.get()->GetTableOid() == + installed_index.get()->table_oid) && + (index.second.get()->GetKeyAttrs() == + installed_index.get()->column_oids)) { + found = true; + } + } + // Drop only indexes which are not suggested this time. + if (!found) { + ret_indexes.push_back(index.second); + } + } + } + return ret_indexes; +} + +void IndexSelectionJob::CreateIndexRPC(brain::HypotheticalIndexObject *index) { + // TODO: Remove hardcoded database name and server end point. + capnp::EzRpcClient client("localhost:15445"); + PelotonService::Client peloton_service = client.getMain(); + + // Create the index name: concat - db_id, table_id, col_ids + std::stringstream sstream; + sstream << brain_suggested_index_prefix_str << "_" << index->db_oid << "_" + << index->table_oid << "_"; + std::vector col_oid_vector; + for (auto col : index->column_oids) { + col_oid_vector.push_back(col); + sstream << col << "_"; + } + auto index_name = sstream.str(); + + auto request = peloton_service.createIndexRequest(); + request.getRequest().setDatabaseOid(index->db_oid); + request.getRequest().setTableOid(index->table_oid); + request.getRequest().setIndexName(index_name); + request.getRequest().setUniqueKeys(false); + + auto col_list = + request.getRequest().initKeyAttrOids(index->column_oids.size()); + for (auto i = 0UL; i < index->column_oids.size(); i++) { + col_list.set(i, index->column_oids[i]); + } + + PELOTON_ASSERT(index->column_oids.size() > 0); + auto response = request.send().wait(client.getWaitScope()); +} + +void IndexSelectionJob::DropIndexRPC(oid_t database_oid, + catalog::IndexCatalogObject *index) { + // TODO: Remove hardcoded database name and server end point. 
+ // TODO: Have to be removed when merged with tli's code. + capnp::EzRpcClient client("localhost:15445"); + PelotonService::Client peloton_service = client.getMain(); + + auto request = peloton_service.dropIndexRequest(); + request.getRequest().setDatabaseOid(database_oid); + request.getRequest().setIndexOid(index->GetIndexOid()); + + auto response = request.send().wait(client.getWaitScope()); +} + +uint64_t IndexSelectionJob::GetLatestQueryTimestamp( + std::vector> *queries) { + uint64_t latest_time = 0; + for (auto query : *queries) { + if (query.first > latest_time) { + latest_time = query.first; + } + } + return latest_time; +} +} +} diff --git a/src/brain/index_selection_util.cpp b/src/brain/index_selection_util.cpp new file mode 100644 index 00000000000..4ebeda9d2f1 --- /dev/null +++ b/src/brain/index_selection_util.cpp @@ -0,0 +1,294 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection_util.cpp +// +// Identification: src/brain/index_selection_util.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "brain/index_selection_util.h" +#include "common/logger.h" + +namespace peloton { +namespace brain { + +//===--------------------------------------------------------------------===// +// IndexObject +//===--------------------------------------------------------------------===// + +const std::string HypotheticalIndexObject::ToString() const { + std::stringstream str_stream; + str_stream << "Database: " << db_oid << "\n"; + str_stream << "Table: " << table_oid << "\n"; + str_stream << "Columns: "; + for (auto col : column_oids) { + str_stream << col << ", "; + } + str_stream << "\n"; + return str_stream.str(); +} + +bool HypotheticalIndexObject::operator==( + const HypotheticalIndexObject &obj) const { + return (db_oid == obj.db_oid && table_oid == obj.table_oid && + 
column_oids == obj.column_oids); +} + +bool HypotheticalIndexObject::IsCompatible( + std::shared_ptr index) const { + return (db_oid == index->db_oid) && (table_oid == index->table_oid); +} + +HypotheticalIndexObject HypotheticalIndexObject::Merge( + std::shared_ptr index) { + HypotheticalIndexObject result; + result.db_oid = db_oid; + result.table_oid = table_oid; + result.column_oids = column_oids; + for (auto column : index->column_oids) { + if (std::find(column_oids.begin(), column_oids.end(), column) == + column_oids.end()) + result.column_oids.push_back(column); + } + return result; +} + +//===--------------------------------------------------------------------===// +// IndexConfiguration +//===--------------------------------------------------------------------===// + +void IndexConfiguration::Merge(IndexConfiguration &config) { + auto indexes = config.GetIndexes(); + for (auto it = indexes.begin(); it != indexes.end(); it++) { + indexes_.insert(*it); + } +} + +void IndexConfiguration::Set(IndexConfiguration &config) { + indexes_.clear(); + auto indexes = config.GetIndexes(); + for (auto it = indexes.begin(); it != indexes.end(); it++) { + indexes_.insert(*it); + } +} + +void IndexConfiguration::RemoveIndexObject( + const std::shared_ptr &index_info) { + indexes_.erase(index_info); +} + +void IndexConfiguration::AddIndexObject( + const std::shared_ptr &index_info) { + indexes_.insert(index_info); +} + +size_t IndexConfiguration::GetIndexCount() const { return indexes_.size(); } + +bool IndexConfiguration::IsEmpty() const { return indexes_.empty(); } + +const std::set> + &IndexConfiguration::GetIndexes() const { + return indexes_; +} + +const std::string IndexConfiguration::ToString() const { + std::stringstream str_stream; + str_stream << "Num of indexes: " << GetIndexCount() << "\n"; + for (auto index : indexes_) { + str_stream << index->ToString() << " "; + } + return str_stream.str(); +} + +bool IndexConfiguration::operator==(const IndexConfiguration 
&config) const { + auto config_indexes = config.GetIndexes(); + return indexes_ == config_indexes; +} + +IndexConfiguration IndexConfiguration::operator-( + const IndexConfiguration &config) { + auto config_indexes = config.GetIndexes(); + + std::set> result; + std::set_difference(indexes_.begin(), indexes_.end(), config_indexes.begin(), + config_indexes.end(), + std::inserter(result, result.end())); + return IndexConfiguration(result); +} + +void IndexConfiguration::Clear() { indexes_.clear(); } + +//===--------------------------------------------------------------------===// +// IndexObjectPool +//===--------------------------------------------------------------------===// + +std::shared_ptr IndexObjectPool::GetIndexObject( + HypotheticalIndexObject &obj) { + auto ret = map_.find(obj); + if (ret != map_.end()) { + return ret->second; + } + return nullptr; +} + +std::shared_ptr IndexObjectPool::PutIndexObject( + HypotheticalIndexObject &obj) { + auto index_s_ptr = GetIndexObject(obj); + if (index_s_ptr != nullptr) return index_s_ptr; + HypotheticalIndexObject *index_copy = new HypotheticalIndexObject(); + *index_copy = obj; + index_s_ptr = std::shared_ptr(index_copy); + map_[*index_copy] = index_s_ptr; + return index_s_ptr; +} + +//===--------------------------------------------------------------------===// +// Workload +//===--------------------------------------------------------------------===// + +Workload::Workload(std::vector &queries, std::string database_name, + concurrency::TransactionContext *txn) + : database_name(database_name) { + LOG_TRACE("Initializing workload with %ld queries", queries.size()); + std::unique_ptr binder( + new binder::BindNodeVisitor(txn, database_name)); + + // Parse and bind every query. Store the results in the workload vector. + for (auto query : queries) { + LOG_DEBUG("Query: %s", query.c_str()); + + // Create a unique_ptr to free this pointer at the end of this loop + // iteration. 
+ auto stmt_list = std::unique_ptr( + parser::PostgresParser::ParseSQLString(query)); + PELOTON_ASSERT(stmt_list->is_valid); + // TODO[vamshi]: Only one query for now. + PELOTON_ASSERT(stmt_list->GetNumStatements() == 1); + + // Create a new shared ptr from the unique ptr because + // these queries will be referenced by multiple objects later. + // Release the unique ptr from the stmt list to avoid freeing at the end + // of this loop iteration. + auto stmt = stmt_list->PassOutStatement(0); + auto stmt_shared = std::shared_ptr(stmt.release()); + PELOTON_ASSERT(stmt_shared->GetType() != StatementType::INVALID); + + try { + // Bind the query + binder->BindNameToNode(stmt_shared.get()); + } catch (Exception e) { + LOG_DEBUG("Cannot bind this query"); + continue; + } + + // Only take the DML queries from the workload + switch (stmt_shared->GetType()) { + case StatementType::INSERT: + case StatementType::DELETE: + case StatementType::UPDATE: + case StatementType::SELECT: { + // Get all the table names referenced in the query. + std::unordered_set tables_used; + Workload::GetTableNamesReferenced(stmt_shared, tables_used); + AddQuery(stmt_shared, tables_used); + } + default: + // Ignore other queries. + LOG_TRACE("Ignoring query: %s", stmt->GetInfo().c_str()); + } + } +} + +void Workload::GetTableNamesReferenced( + std::shared_ptr query, + std::unordered_set &table_names) { + // populated if this query has a cross-product table references. 
+ std::vector> *table_cp_list; + + switch (query->GetType()) { + case StatementType::INSERT: { + auto sql_statement = dynamic_cast(query.get()); + table_names.insert(sql_statement->table_ref_->GetTableName()); + break; + } + + case StatementType::DELETE: { + auto sql_statement = dynamic_cast(query.get()); + table_names.insert(sql_statement->table_ref->GetTableName()); + break; + } + + case StatementType::UPDATE: { + auto sql_statement = dynamic_cast(query.get()); + table_names.insert(sql_statement->table->GetTableName()); + break; + } + + case StatementType::SELECT: { + auto sql_statement = dynamic_cast(query.get()); + // Select can operate on more than 1 table. + switch (sql_statement->from_table->type) { + case TableReferenceType::NAME: { + // Single table. + LOG_DEBUG("Table name is %s", + sql_statement->from_table.get()->GetTableName().c_str()); + table_names.insert(sql_statement->from_table.get()->GetTableName()); + break; + } + case TableReferenceType::JOIN: { + // Get all table names in the join. + std::deque queue; + queue.push_back(sql_statement->from_table->join->left.get()); + queue.push_back(sql_statement->from_table->join->right.get()); + while (queue.size() != 0) { + auto front = queue.front(); + queue.pop_front(); + if (front == nullptr) { + continue; + } + if (front->type == TableReferenceType::JOIN) { + queue.push_back(front->join->left.get()); + queue.push_back(front->join->right.get()); + } else if (front->type == TableReferenceType::NAME) { + table_names.insert(front->GetTableName()); + } else { + PELOTON_ASSERT(false); + } + } + break; + } + case TableReferenceType::SELECT: { + Workload::GetTableNamesReferenced( + std::shared_ptr( + sql_statement->from_table->select), + table_names); + break; + } + case TableReferenceType::CROSS_PRODUCT: { + // Cross product table list. 
+ table_cp_list = &(sql_statement->from_table->list); + for (auto &table : *table_cp_list) { + table_names.insert(table->GetTableName()); + } + break; + } + case TableReferenceType::INVALID: { + LOG_ERROR("Invalid table reference"); + return; + } + } + break; + } + default: { + LOG_ERROR("Cannot handle DDL statements"); + PELOTON_ASSERT(false); + } + } +} + +} // namespace brain +} // namespace peloton diff --git a/src/brain/what_if_index.cpp b/src/brain/what_if_index.cpp new file mode 100644 index 00000000000..272a6f70997 --- /dev/null +++ b/src/brain/what_if_index.cpp @@ -0,0 +1,113 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// what_if_index.cpp +// +// Identification: src/brain/what_if_index.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "brain/what_if_index.h" +#include "optimizer/operators.h" +#include "traffic_cop/traffic_cop.h" + +namespace peloton { +namespace brain { + +unsigned long WhatIfIndex::index_seq_no = 0; + +std::unique_ptr +WhatIfIndex::GetCostAndBestPlanTree(std::shared_ptr query, + IndexConfiguration &config, + std::string database_name, + concurrency::TransactionContext *txn) { + // Find all the tables that are referenced in the parsed query. + std::unordered_set tables_used; + Workload::GetTableNamesReferenced(query, tables_used); + return GetCostAndBestPlanTree(std::make_pair(query, tables_used), config, + database_name, txn); +} + +std::unique_ptr +WhatIfIndex::GetCostAndBestPlanTree( + std::pair, + std::unordered_set> query, + IndexConfiguration &config, std::string database_name, + concurrency::TransactionContext *txn) { + + LOG_TRACE("***** GetCostAndBestPlanTree **** \n"); + // Load the indexes into the cache for each table so that the optimizer uses + // the indexes that we provide. 
+ for (auto table_name : query.second) { + // Load the tables into cache. + + // TODO: Hard coding the schema name for build to pass. + auto table_object = catalog::Catalog::GetInstance()->GetTableObject( + database_name, "public", table_name, txn); + + // Evict all the existing real indexes and + // insert the what-if indexes into the cache. + table_object->EvictAllIndexObjects(); + + // Upon evict index objects, the index set becomes + // invalid. Set it to valid so that we don't query + // the catalog again while doing query optimization later. + table_object->SetValidIndexObjects(true); + + auto index_set = config.GetIndexes(); + for (auto it = index_set.begin(); it != index_set.end(); it++) { + auto index = *it; + if (index->table_oid == table_object->GetTableOid()) { + auto index_catalog_obj = CreateIndexCatalogObject(index.get()); + table_object->InsertIndexObject(index_catalog_obj); + LOG_TRACE("Created a new hypothetical index %d on table: %d", + index_catalog_obj->GetIndexOid(), + index_catalog_obj->GetTableOid()); + for (auto col : index_catalog_obj->GetKeyAttrs()) { + (void)col; // for debug mode. + LOG_TRACE("Cols: %d", col); + } + } + } + } + + // Perform query optimization with the hypothetical indexes + optimizer::Optimizer optimizer; + auto opt_info_obj = optimizer.GetOptimizedPlanInfo(query.first, txn); + + LOG_TRACE("Query: %s", query.first->GetInfo().c_str()); + LOG_TRACE("Hypothetical config: %s", config.ToString().c_str()); + LOG_TRACE("Got cost %lf", opt_info_obj->cost); + LOG_TRACE("Plan type: %s", opt_info_obj->plan->GetInfo().c_str()); + return opt_info_obj; +} + +std::shared_ptr +WhatIfIndex::CreateIndexCatalogObject(HypotheticalIndexObject *index_obj) { + // Create an index name: + // index_____... 
+ std::ostringstream index_name_oss; + index_name_oss << "index_" << index_obj->db_oid << "_" + << index_obj->table_oid; + for (auto it = index_obj->column_oids.begin(); + it != index_obj->column_oids.end(); it++) { + index_name_oss << (*it) << "_"; + } + // TODO: For now, we assume BW-TREE and DEFAULT index constraint type for the + // hypothetical indexes + // TODO: Support unique keys. + // Create a dummy catalog object. + auto col_oids = std::vector(index_obj->column_oids.begin(), + index_obj->column_oids.end()); + auto index_cat_obj = std::shared_ptr( + new catalog::IndexCatalogObject( + index_seq_no++, index_name_oss.str(), index_obj->table_oid, + IndexType::BWTREE, IndexConstraintType::DEFAULT, false, col_oids)); + return index_cat_obj; +} + +} // namespace brain +} // namespace peloton diff --git a/src/catalog/abstract_catalog.cpp b/src/catalog/abstract_catalog.cpp index 645e9c9d93f..9e250abc757 100644 --- a/src/catalog/abstract_catalog.cpp +++ b/src/catalog/abstract_catalog.cpp @@ -6,7 +6,7 @@ // // Identification: src/catalog/abstract_catalog.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -118,8 +118,8 @@ bool AbstractCatalog::InsertTuple(std::unique_ptr tuple, executor::ExecutionResult this_p_status; auto on_complete = [&this_p_status]( - executor::ExecutionResult p_status, - std::vector &&values UNUSED_ATTRIBUTE) { + executor::ExecutionResult p_status, + std::vector &&values UNUSED_ATTRIBUTE) { this_p_status = p_status; }; @@ -190,6 +190,26 @@ AbstractCatalog::GetResultWithIndexScan( std::vector column_offsets, oid_t index_offset, std::vector values, concurrency::TransactionContext *txn) const { + std::vector expr_types(values.size(), + ExpressionType::COMPARE_EQUAL); + return GetResultWithIndexScan(column_offsets, index_offset, values, + expr_types, txn); +} + 
+/*@brief Index scan helper function + * @param column_offsets Column ids for search (projection) + * @param index_offset Offset of index for scan + * @param values Values for search + * @param expr_types comparison expression type for each entry of values + * @param txn TransactionContext + * @return Unique pointer of vector of logical tiles + */ +std::unique_ptr>> +AbstractCatalog::GetResultWithIndexScan( + const std::vector &column_offsets, const oid_t &index_offset, + const std::vector &values, + const std::vector &expr_types, + concurrency::TransactionContext *txn) const { if (txn == nullptr) throw CatalogException("Scan table requires transaction"); // Index scan @@ -200,8 +220,7 @@ AbstractCatalog::GetResultWithIndexScan( std::vector key_column_offsets = index->GetMetadata()->GetKeySchema()->GetIndexedColumns(); PELOTON_ASSERT(values.size() == key_column_offsets.size()); - std::vector expr_types(values.size(), - ExpressionType::COMPARE_EQUAL); + PELOTON_ASSERT(values.size() == expr_types.size()); std::vector runtime_keys; planner::IndexScanPlan::IndexScanDesc index_scan_desc( diff --git a/src/catalog/column_stats_catalog.cpp b/src/catalog/column_stats_catalog.cpp index bbe94340cdb..8d603483fa7 100644 --- a/src/catalog/column_stats_catalog.cpp +++ b/src/catalog/column_stats_catalog.cpp @@ -1,235 +1,237 @@ -//===----------------------------------------------------------------------===// -// -// Peloton -// -// column_stats_catalog.cpp -// -// Identification: src/catalog/column_stats_catalog.cpp -// -// Copyright (c) 2015-17, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include "catalog/column_stats_catalog.h" - -#include "catalog/catalog.h" -#include "executor/logical_tile.h" -#include "optimizer/stats/column_stats_collector.h" -#include "storage/data_table.h" -#include "storage/tuple.h" - -namespace peloton { -namespace catalog { - -ColumnStatsCatalog *ColumnStatsCatalog::GetInstance( -
concurrency::TransactionContext *txn) { - static ColumnStatsCatalog column_stats_catalog{txn}; - return &column_stats_catalog; -} - -ColumnStatsCatalog::ColumnStatsCatalog(concurrency::TransactionContext *txn) - : AbstractCatalog("CREATE TABLE " CATALOG_DATABASE_NAME - "." CATALOG_SCHEMA_NAME "." COLUMN_STATS_CATALOG_NAME - " (" - "database_id INT NOT NULL, " - "table_id INT NOT NULL, " - "column_id INT NOT NULL, " - "num_rows INT NOT NULL, " - "cardinality DECIMAL NOT NULL, " - "frac_null DECIMAL NOT NULL, " - "most_common_vals VARCHAR, " - "most_common_freqs VARCHAR, " - "histogram_bounds VARCHAR, " - "column_name VARCHAR, " - "has_index BOOLEAN);", - txn) { - // unique key: (database_id, table_id, column_id) - Catalog::GetInstance()->CreateIndex( - CATALOG_DATABASE_NAME, CATALOG_SCHEMA_NAME, COLUMN_STATS_CATALOG_NAME, - {0, 1, 2}, COLUMN_STATS_CATALOG_NAME "_skey0", true, IndexType::BWTREE, - txn); - // non-unique key: (database_id, table_id) - Catalog::GetInstance()->CreateIndex( - CATALOG_DATABASE_NAME, CATALOG_SCHEMA_NAME, COLUMN_STATS_CATALOG_NAME, - {0, 1}, COLUMN_STATS_CATALOG_NAME "_skey1", false, IndexType::BWTREE, - txn); -} - -ColumnStatsCatalog::~ColumnStatsCatalog() {} - -bool ColumnStatsCatalog::InsertColumnStats( - oid_t database_id, oid_t table_id, oid_t column_id, int num_rows, - double cardinality, double frac_null, std::string most_common_vals, - std::string most_common_freqs, std::string histogram_bounds, - std::string column_name, bool has_index, type::AbstractPool *pool, - concurrency::TransactionContext *txn) { - std::unique_ptr tuple( - new storage::Tuple(catalog_table_->GetSchema(), true)); - - auto val_db_id = type::ValueFactory::GetIntegerValue(database_id); - auto val_table_id = type::ValueFactory::GetIntegerValue(table_id); - auto val_column_id = type::ValueFactory::GetIntegerValue(column_id); - auto val_num_row = type::ValueFactory::GetIntegerValue(num_rows); - auto val_cardinality = type::ValueFactory::GetDecimalValue(cardinality); 
- auto val_frac_null = type::ValueFactory::GetDecimalValue(frac_null); - - type::Value val_common_val, val_common_freq; - if (!most_common_vals.empty()) { - val_common_val = type::ValueFactory::GetVarcharValue(most_common_vals); - val_common_freq = type::ValueFactory::GetVarcharValue(most_common_freqs); - } else { - val_common_val = - type::ValueFactory::GetNullValueByType(type::TypeId::VARCHAR); - val_common_freq = - type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); - } - - type::Value val_hist_bounds; - if (!histogram_bounds.empty()) { - val_hist_bounds = type::ValueFactory::GetVarcharValue(histogram_bounds); - } else { - val_hist_bounds = - type::ValueFactory::GetNullValueByType(type::TypeId::VARCHAR); - } - - type::Value val_column_name = - type::ValueFactory::GetVarcharValue(column_name); - type::Value val_has_index = type::ValueFactory::GetBooleanValue(has_index); - - tuple->SetValue(ColumnId::DATABASE_ID, val_db_id, nullptr); - tuple->SetValue(ColumnId::TABLE_ID, val_table_id, nullptr); - tuple->SetValue(ColumnId::COLUMN_ID, val_column_id, nullptr); - tuple->SetValue(ColumnId::NUM_ROWS, val_num_row, nullptr); - tuple->SetValue(ColumnId::CARDINALITY, val_cardinality, nullptr); - tuple->SetValue(ColumnId::FRAC_NULL, val_frac_null, nullptr); - tuple->SetValue(ColumnId::MOST_COMMON_VALS, val_common_val, pool); - tuple->SetValue(ColumnId::MOST_COMMON_FREQS, val_common_freq, pool); - tuple->SetValue(ColumnId::HISTOGRAM_BOUNDS, val_hist_bounds, pool); - tuple->SetValue(ColumnId::COLUMN_NAME, val_column_name, pool); - tuple->SetValue(ColumnId::HAS_INDEX, val_has_index, nullptr); - - // Insert the tuple into catalog table - return InsertTuple(std::move(tuple), txn); -} - -bool ColumnStatsCatalog::DeleteColumnStats( - oid_t database_id, oid_t table_id, oid_t column_id, - concurrency::TransactionContext *txn) { - oid_t index_offset = IndexId::SECONDARY_KEY_0; // Secondary key index - - std::vector values; - 
values.push_back(type::ValueFactory::GetIntegerValue(database_id).Copy()); - values.push_back(type::ValueFactory::GetIntegerValue(table_id).Copy()); - values.push_back(type::ValueFactory::GetIntegerValue(column_id).Copy()); - - return DeleteWithIndexScan(index_offset, values, txn); -} - -std::unique_ptr> ColumnStatsCatalog::GetColumnStats( - oid_t database_id, oid_t table_id, oid_t column_id, - concurrency::TransactionContext *txn) { - std::vector column_ids( - {ColumnId::NUM_ROWS, ColumnId::CARDINALITY, ColumnId::FRAC_NULL, - ColumnId::MOST_COMMON_VALS, ColumnId::MOST_COMMON_FREQS, - ColumnId::HISTOGRAM_BOUNDS, ColumnId::COLUMN_NAME, ColumnId::HAS_INDEX}); - oid_t index_offset = IndexId::SECONDARY_KEY_0; // Secondary key index - - std::vector values; - values.push_back(type::ValueFactory::GetIntegerValue(database_id).Copy()); - values.push_back(type::ValueFactory::GetIntegerValue(table_id).Copy()); - values.push_back(type::ValueFactory::GetIntegerValue(column_id).Copy()); - - auto result_tiles = - GetResultWithIndexScan(column_ids, index_offset, values, txn); - - PELOTON_ASSERT(result_tiles->size() <= 1); // unique - if (result_tiles->size() == 0) { - return nullptr; - } - - auto tile = (*result_tiles)[0].get(); - PELOTON_ASSERT(tile->GetTupleCount() <= 1); - if (tile->GetTupleCount() == 0) { - return nullptr; - } - - type::Value num_rows, cardinality, frac_null, most_common_vals, - most_common_freqs, hist_bounds, column_name, has_index; - - num_rows = tile->GetValue(0, ColumnStatsOffset::NUM_ROWS_OFF); - cardinality = tile->GetValue(0, ColumnStatsOffset::CARDINALITY_OFF); - frac_null = tile->GetValue(0, ColumnStatsOffset::FRAC_NULL_OFF); - most_common_vals = tile->GetValue(0, ColumnStatsOffset::COMMON_VALS_OFF); - most_common_freqs = tile->GetValue(0, ColumnStatsOffset::COMMON_FREQS_OFF); - hist_bounds = tile->GetValue(0, ColumnStatsOffset::HIST_BOUNDS_OFF); - column_name = tile->GetValue(0, ColumnStatsOffset::COLUMN_NAME_OFF); - has_index = tile->GetValue(0, 
ColumnStatsOffset::HAS_INDEX_OFF); - - std::unique_ptr> column_stats( - new std::vector({num_rows, cardinality, frac_null, - most_common_vals, most_common_freqs, - hist_bounds, column_name, has_index})); - - return column_stats; -} - -// Return value: number of column stats -size_t ColumnStatsCatalog::GetTableStats( - oid_t database_id, oid_t table_id, concurrency::TransactionContext *txn, - std::map>> - &column_stats_map) { - std::vector column_ids( - {ColumnId::COLUMN_ID, ColumnId::NUM_ROWS, ColumnId::CARDINALITY, - ColumnId::FRAC_NULL, ColumnId::MOST_COMMON_VALS, - ColumnId::MOST_COMMON_FREQS, ColumnId::HISTOGRAM_BOUNDS, - ColumnId::COLUMN_NAME, ColumnId::HAS_INDEX}); - oid_t index_offset = IndexId::SECONDARY_KEY_1; // Secondary key index - - std::vector values; - values.push_back(type::ValueFactory::GetIntegerValue(database_id).Copy()); - values.push_back(type::ValueFactory::GetIntegerValue(table_id).Copy()); - - auto result_tiles = - GetResultWithIndexScan(column_ids, index_offset, values, txn); - - PELOTON_ASSERT(result_tiles->size() <= 1); // unique - if (result_tiles->size() == 0) { - return 0; - } - auto tile = (*result_tiles)[0].get(); - size_t tuple_count = tile->GetTupleCount(); - LOG_DEBUG("Tuple count: %lu", tuple_count); - if (tuple_count == 0) { - return 0; - } - - type::Value num_rows, cardinality, frac_null, most_common_vals, - most_common_freqs, hist_bounds, column_name, has_index; - for (size_t tuple_id = 0; tuple_id < tuple_count; ++tuple_id) { - num_rows = tile->GetValue(tuple_id, 1 + ColumnStatsOffset::NUM_ROWS_OFF); - cardinality = - tile->GetValue(tuple_id, 1 + ColumnStatsOffset::CARDINALITY_OFF); - frac_null = tile->GetValue(tuple_id, 1 + ColumnStatsOffset::FRAC_NULL_OFF); - most_common_vals = - tile->GetValue(tuple_id, 1 + ColumnStatsOffset::COMMON_VALS_OFF); - most_common_freqs = - tile->GetValue(tuple_id, 1 + ColumnStatsOffset::COMMON_FREQS_OFF); - hist_bounds = - tile->GetValue(tuple_id, 1 + ColumnStatsOffset::HIST_BOUNDS_OFF); - 
column_name = - tile->GetValue(tuple_id, 1 + ColumnStatsOffset::COLUMN_NAME_OFF); - has_index = tile->GetValue(tuple_id, 1 + ColumnStatsOffset::HAS_INDEX_OFF); - - std::unique_ptr> column_stats( - new std::vector({num_rows, cardinality, frac_null, - most_common_vals, most_common_freqs, - hist_bounds, column_name, has_index})); - - oid_t column_id = tile->GetValue(tuple_id, 0).GetAs(); - column_stats_map[column_id] = std::move(column_stats); - } - return tuple_count; -} - -} // namespace catalog -} // namespace peloton +//===----------------------------------------------------------------------===// +// +// Peloton +// +// column_stats_catalog.cpp +// +// Identification: src/catalog/column_stats_catalog.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "catalog/column_stats_catalog.h" + +#include "catalog/catalog.h" +#include "executor/logical_tile.h" +#include "optimizer/stats/column_stats_collector.h" +#include "storage/data_table.h" +#include "storage/tuple.h" + +namespace peloton { +namespace catalog { + +ColumnStatsCatalog *ColumnStatsCatalog::GetInstance( + concurrency::TransactionContext *txn) { + static ColumnStatsCatalog column_stats_catalog{txn}; + return &column_stats_catalog; +} + +// TODO [VAMSHI]: Removing the NOT NULL contraints for benchmark results. +// Enable it later +ColumnStatsCatalog::ColumnStatsCatalog(concurrency::TransactionContext *txn) + : AbstractCatalog("CREATE TABLE " CATALOG_DATABASE_NAME + "." CATALOG_SCHEMA_NAME "." 
COLUMN_STATS_CATALOG_NAME + " (" + "database_id INT, " + "table_id INT, " + "column_id INT, " + "num_rows INT, " + "cardinality DECIMAL, " + "frac_null DECIMAL, " + "most_common_vals VARCHAR, " + "most_common_freqs VARCHAR, " + "histogram_bounds VARCHAR, " + "column_name VARCHAR, " + "has_index BOOLEAN);", + txn) { + // unique key: (database_id, table_id, column_id) + Catalog::GetInstance()->CreateIndex( + CATALOG_DATABASE_NAME, CATALOG_SCHEMA_NAME, COLUMN_STATS_CATALOG_NAME, + {0, 1, 2}, COLUMN_STATS_CATALOG_NAME "_skey0", true, IndexType::BWTREE, + txn); + // non-unique key: (database_id, table_id) + Catalog::GetInstance()->CreateIndex( + CATALOG_DATABASE_NAME, CATALOG_SCHEMA_NAME, COLUMN_STATS_CATALOG_NAME, + {0, 1}, COLUMN_STATS_CATALOG_NAME "_skey1", false, IndexType::BWTREE, + txn); +} + +ColumnStatsCatalog::~ColumnStatsCatalog() {} + +bool ColumnStatsCatalog::InsertColumnStats( + oid_t database_id, oid_t table_id, oid_t column_id, int num_rows, + double cardinality, double frac_null, std::string most_common_vals, + std::string most_common_freqs, std::string histogram_bounds, + std::string column_name, bool has_index, type::AbstractPool *pool, + concurrency::TransactionContext *txn) { + std::unique_ptr tuple( + new storage::Tuple(catalog_table_->GetSchema(), true)); + + auto val_db_id = type::ValueFactory::GetIntegerValue(database_id); + auto val_table_id = type::ValueFactory::GetIntegerValue(table_id); + auto val_column_id = type::ValueFactory::GetIntegerValue(column_id); + auto val_num_row = type::ValueFactory::GetIntegerValue(num_rows); + auto val_cardinality = type::ValueFactory::GetDecimalValue(cardinality); + auto val_frac_null = type::ValueFactory::GetDecimalValue(frac_null); + + type::Value val_common_val, val_common_freq; + if (!most_common_vals.empty()) { + val_common_val = type::ValueFactory::GetVarcharValue(most_common_vals); + val_common_freq = type::ValueFactory::GetVarcharValue(most_common_freqs); + } else { + val_common_val = + 
type::ValueFactory::GetNullValueByType(type::TypeId::VARCHAR); + val_common_freq = + type::ValueFactory::GetNullValueByType(type::TypeId::DECIMAL); + } + + type::Value val_hist_bounds; + if (!histogram_bounds.empty()) { + val_hist_bounds = type::ValueFactory::GetVarcharValue(histogram_bounds); + } else { + val_hist_bounds = + type::ValueFactory::GetNullValueByType(type::TypeId::VARCHAR); + } + + type::Value val_column_name = + type::ValueFactory::GetVarcharValue(column_name); + type::Value val_has_index = type::ValueFactory::GetBooleanValue(has_index); + + tuple->SetValue(ColumnId::DATABASE_ID, val_db_id, nullptr); + tuple->SetValue(ColumnId::TABLE_ID, val_table_id, nullptr); + tuple->SetValue(ColumnId::COLUMN_ID, val_column_id, nullptr); + tuple->SetValue(ColumnId::NUM_ROWS, val_num_row, nullptr); + tuple->SetValue(ColumnId::CARDINALITY, val_cardinality, nullptr); + tuple->SetValue(ColumnId::FRAC_NULL, val_frac_null, nullptr); + tuple->SetValue(ColumnId::MOST_COMMON_VALS, val_common_val, pool); + tuple->SetValue(ColumnId::MOST_COMMON_FREQS, val_common_freq, pool); + tuple->SetValue(ColumnId::HISTOGRAM_BOUNDS, val_hist_bounds, pool); + tuple->SetValue(ColumnId::COLUMN_NAME, val_column_name, pool); + tuple->SetValue(ColumnId::HAS_INDEX, val_has_index, nullptr); + + // Insert the tuple into catalog table + return InsertTuple(std::move(tuple), txn); +} + +bool ColumnStatsCatalog::DeleteColumnStats( + oid_t database_id, oid_t table_id, oid_t column_id, + concurrency::TransactionContext *txn) { + oid_t index_offset = IndexId::SECONDARY_KEY_0; // Secondary key index + + std::vector values; + values.push_back(type::ValueFactory::GetIntegerValue(database_id).Copy()); + values.push_back(type::ValueFactory::GetIntegerValue(table_id).Copy()); + values.push_back(type::ValueFactory::GetIntegerValue(column_id).Copy()); + + return DeleteWithIndexScan(index_offset, values, txn); +} + +std::unique_ptr> ColumnStatsCatalog::GetColumnStats( + oid_t database_id, oid_t table_id, oid_t 
column_id, + concurrency::TransactionContext *txn) { + std::vector column_ids( + {ColumnId::NUM_ROWS, ColumnId::CARDINALITY, ColumnId::FRAC_NULL, + ColumnId::MOST_COMMON_VALS, ColumnId::MOST_COMMON_FREQS, + ColumnId::HISTOGRAM_BOUNDS, ColumnId::COLUMN_NAME, ColumnId::HAS_INDEX}); + oid_t index_offset = IndexId::SECONDARY_KEY_0; // Secondary key index + + std::vector values; + values.push_back(type::ValueFactory::GetIntegerValue(database_id).Copy()); + values.push_back(type::ValueFactory::GetIntegerValue(table_id).Copy()); + values.push_back(type::ValueFactory::GetIntegerValue(column_id).Copy()); + + auto result_tiles = + GetResultWithIndexScan(column_ids, index_offset, values, txn); + + PELOTON_ASSERT(result_tiles->size() <= 1); // unique + if (result_tiles->size() == 0) { + return nullptr; + } + + auto tile = (*result_tiles)[0].get(); + PELOTON_ASSERT(tile->GetTupleCount() <= 1); + if (tile->GetTupleCount() == 0) { + return nullptr; + } + + type::Value num_rows, cardinality, frac_null, most_common_vals, + most_common_freqs, hist_bounds, column_name, has_index; + + num_rows = tile->GetValue(0, ColumnStatsOffset::NUM_ROWS_OFF); + cardinality = tile->GetValue(0, ColumnStatsOffset::CARDINALITY_OFF); + frac_null = tile->GetValue(0, ColumnStatsOffset::FRAC_NULL_OFF); + most_common_vals = tile->GetValue(0, ColumnStatsOffset::COMMON_VALS_OFF); + most_common_freqs = tile->GetValue(0, ColumnStatsOffset::COMMON_FREQS_OFF); + hist_bounds = tile->GetValue(0, ColumnStatsOffset::HIST_BOUNDS_OFF); + column_name = tile->GetValue(0, ColumnStatsOffset::COLUMN_NAME_OFF); + has_index = tile->GetValue(0, ColumnStatsOffset::HAS_INDEX_OFF); + + std::unique_ptr> column_stats( + new std::vector({num_rows, cardinality, frac_null, + most_common_vals, most_common_freqs, + hist_bounds, column_name, has_index})); + + return column_stats; +} + +// Return value: number of column stats +size_t ColumnStatsCatalog::GetTableStats( + oid_t database_id, oid_t table_id, concurrency::TransactionContext 
*txn, + std::map>> + &column_stats_map) { + std::vector column_ids( + {ColumnId::COLUMN_ID, ColumnId::NUM_ROWS, ColumnId::CARDINALITY, + ColumnId::FRAC_NULL, ColumnId::MOST_COMMON_VALS, + ColumnId::MOST_COMMON_FREQS, ColumnId::HISTOGRAM_BOUNDS, + ColumnId::COLUMN_NAME, ColumnId::HAS_INDEX}); + oid_t index_offset = IndexId::SECONDARY_KEY_1; // Secondary key index + + std::vector values; + values.push_back(type::ValueFactory::GetIntegerValue(database_id).Copy()); + values.push_back(type::ValueFactory::GetIntegerValue(table_id).Copy()); + + auto result_tiles = + GetResultWithIndexScan(column_ids, index_offset, values, txn); + + PELOTON_ASSERT(result_tiles->size() <= 1); // unique + if (result_tiles->size() == 0) { + return 0; + } + auto tile = (*result_tiles)[0].get(); + size_t tuple_count = tile->GetTupleCount(); + LOG_TRACE("Tuple count: %lu", tuple_count); + if (tuple_count == 0) { + return 0; + } + + type::Value num_rows, cardinality, frac_null, most_common_vals, + most_common_freqs, hist_bounds, column_name, has_index; + for (size_t tuple_id = 0; tuple_id < tuple_count; ++tuple_id) { + num_rows = tile->GetValue(tuple_id, 1 + ColumnStatsOffset::NUM_ROWS_OFF); + cardinality = + tile->GetValue(tuple_id, 1 + ColumnStatsOffset::CARDINALITY_OFF); + frac_null = tile->GetValue(tuple_id, 1 + ColumnStatsOffset::FRAC_NULL_OFF); + most_common_vals = + tile->GetValue(tuple_id, 1 + ColumnStatsOffset::COMMON_VALS_OFF); + most_common_freqs = + tile->GetValue(tuple_id, 1 + ColumnStatsOffset::COMMON_FREQS_OFF); + hist_bounds = + tile->GetValue(tuple_id, 1 + ColumnStatsOffset::HIST_BOUNDS_OFF); + column_name = + tile->GetValue(tuple_id, 1 + ColumnStatsOffset::COLUMN_NAME_OFF); + has_index = tile->GetValue(tuple_id, 1 + ColumnStatsOffset::HAS_INDEX_OFF); + + std::unique_ptr> column_stats( + new std::vector({num_rows, cardinality, frac_null, + most_common_vals, most_common_freqs, + hist_bounds, column_name, has_index})); + + oid_t column_id = tile->GetValue(tuple_id, 0).GetAs(); + 
column_stats_map[column_id] = std::move(column_stats); + } + return tuple_count; +} + +} // namespace catalog +} // namespace peloton diff --git a/src/catalog/index_catalog.cpp b/src/catalog/index_catalog.cpp index da666f36f60..50273bce07f 100644 --- a/src/catalog/index_catalog.cpp +++ b/src/catalog/index_catalog.cpp @@ -6,7 +6,7 @@ // // Identification: src/catalog/index_catalog.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Index Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -54,6 +54,19 @@ IndexCatalogObject::IndexCatalogObject(executor::LogicalTile *tile, int tupleId) LOG_TRACE("the size for indexed key is %lu", key_attrs.size()); } +IndexCatalogObject::IndexCatalogObject(oid_t index_oid, std::string index_name, + oid_t table_oid, IndexType index_type, + IndexConstraintType index_constraint, + bool unique_keys, + std::vector &key_attrs) + : index_oid(index_oid), + index_name(index_name), + table_oid(table_oid), + index_type(index_type), + index_constraint(index_constraint), + unique_keys(unique_keys), + key_attrs(std::vector(key_attrs.begin(), key_attrs.end())) {} + IndexCatalog::IndexCatalog( storage::Database *pg_catalog, UNUSED_ATTRIBUTE type::AbstractPool *pool, UNUSED_ATTRIBUTE concurrency::TransactionContext *txn) @@ -213,7 +226,7 @@ std::shared_ptr IndexCatalog::GetIndexObject( auto table_object = pg_table->GetTableObject(index_object->GetTableOid(), txn); PELOTON_ASSERT(table_object && - table_object->GetTableOid() == index_object->GetTableOid()); + table_object->GetTableOid() == index_object->GetTableOid()); return table_object->GetIndexObject(index_oid); } else { LOG_DEBUG("Found %lu index with oid %u", result_tiles->size(), index_oid); @@ -259,7 +272,7 @@ std::shared_ptr IndexCatalog::GetIndexObject( auto table_object = pg_table->GetTableObject(index_object->GetTableOid(), txn); PELOTON_ASSERT(table_object && - 
table_object->GetTableOid() == index_object->GetTableOid()); + table_object->GetTableOid() == index_object->GetTableOid()); return table_object->GetIndexObject(index_name); } else { LOG_DEBUG("Found %lu index with name %s", result_tiles->size(), @@ -270,6 +283,28 @@ std::shared_ptr IndexCatalog::GetIndexObject( return nullptr; } +std::unordered_map> +IndexCatalog::GetIndexObjects(concurrency::TransactionContext *txn) { + std::unordered_map> result_indexes; + if (txn == nullptr) { + throw CatalogException("Transaction is invalid!"); + } + // try get from cache + auto pg_table = Catalog::GetInstance() + ->GetSystemCatalogs(database_oid) + ->GetTableCatalog(); + auto table_objects = pg_table->GetTableObjects(txn); + if (!table_objects.empty()) { + for (auto table_obj : table_objects) { + auto index_objects = GetIndexObjects(table_obj.first, txn); + for (auto index_obj : index_objects) { + result_indexes[index_obj.first] = index_obj.second; + } + } + } + return result_indexes; +} + /*@brief get all index records from the same table * this function may be useful when calling DropTable * @param table_oid diff --git a/src/catalog/query_history_catalog.cpp b/src/catalog/query_history_catalog.cpp index 4433197ba28..8dc280b492a 100644 --- a/src/catalog/query_history_catalog.cpp +++ b/src/catalog/query_history_catalog.cpp @@ -14,7 +14,7 @@ #include "catalog/catalog.h" #include "storage/data_table.h" -#include "type/value_factory.h" +#include "executor/logical_tile.h" namespace peloton { namespace catalog { @@ -32,7 +32,12 @@ QueryHistoryCatalog::QueryHistoryCatalog(concurrency::TransactionContext *txn) "query_string VARCHAR NOT NULL, " "fingerprint VARCHAR NOT NULL, " "timestamp TIMESTAMP NOT NULL);", - txn) {} + txn) { + // Secondary index on timestamp + Catalog::GetInstance()->CreateIndex( + CATALOG_DATABASE_NAME, CATALOG_SCHEMA_NAME, QUERY_HISTORY_CATALOG_NAME, + {2}, QUERY_HISTORY_CATALOG_NAME "_skey0", false, IndexType::BWTREE, txn); +} 
QueryHistoryCatalog::~QueryHistoryCatalog() = default; @@ -56,5 +61,40 @@ bool QueryHistoryCatalog::InsertQueryHistory( return InsertTuple(std::move(tuple), txn); } +std::unique_ptr>> +QueryHistoryCatalog::GetQueryStringsAfterTimestamp( + const uint64_t start_timestamp, concurrency::TransactionContext *txn) { + LOG_INFO("Start querying.... %" PRId64, start_timestamp); + // Get both timestamp and query string in the result. + std::vector column_ids({ColumnId::TIMESTAMP, ColumnId::QUERY_STRING}); + oid_t index_offset = IndexId::SECONDARY_KEY_0; // Secondary key index + + std::vector values; + values.push_back(type::ValueFactory::GetTimestampValue( + static_cast(start_timestamp))); + + std::vector expr_types(values.size(), + ExpressionType::COMPARE_GREATERTHAN); + + auto result_tiles = + GetResultWithIndexScan(column_ids, index_offset, values, expr_types, txn); + + std::unique_ptr>> queries( + new std::vector>()); + if (result_tiles->size() > 0) { + for (auto &tile : *result_tiles.get()) { + PELOTON_ASSERT(tile->GetColumnCount() == column_ids.size()); + for (auto i = 0UL; i < tile->GetTupleCount(); i++) { + auto timestamp = tile->GetValue(i, 0).GetAs(); + auto query_string = tile->GetValue(i, 1).GetAs(); + auto pair = std::make_pair(timestamp, query_string); + LOG_INFO("Query: %" PRId64 ": %s", pair.first, pair.second); + queries->emplace_back(pair); + } + } + } + return queries; +} + } // namespace catalog } // namespace peloton diff --git a/src/catalog/table_catalog.cpp b/src/catalog/table_catalog.cpp index 34ef723e366..db681f8a704 100644 --- a/src/catalog/table_catalog.cpp +++ b/src/catalog/table_catalog.cpp @@ -6,7 +6,7 @@ // // Identification: src/catalog/table_catalog.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -126,6 +126,16 @@ void TableCatalogObject::EvictAllIndexObjects() 
{ valid_index_objects = false; } +/* + * @brief Sets the index objects to be invalid. + * This is useful in what-if API to avoid querying + * the catalog again by setting is_valid to true. + * @param is_valid + */ +void TableCatalogObject::SetValidIndexObjects(bool is_valid) { + valid_index_objects = is_valid; +} + /* @brief get all index objects of this table into cache * @return map from index oid to cached index object */ diff --git a/src/include/brain/brain.h b/src/include/brain/brain.h index 6614767423b..59b43e1fddf 100644 --- a/src/include/brain/brain.h +++ b/src/include/brain/brain.h @@ -19,6 +19,7 @@ #include "capnp/ez-rpc.h" #include "peloton/capnp/peloton_service.capnp.h" #include "common/notifiable_task.h" +#include "brain/index_selection_util.h" namespace peloton { namespace brain { @@ -28,7 +29,15 @@ namespace brain { * the brain, such as RPC and Catalog. */ class BrainEnvironment { - // TODO(tianyu): fill in as needed + public: + BrainEnvironment() { index_selection_knobs = {3, 3, 10}; } + IndexSelectionKnobs GetIndexSelectionKnobs() { return index_selection_knobs; } + void SetIndexSelectionKnobs(IndexSelectionKnobs knobs) { + index_selection_knobs = knobs; + } + + private: + IndexSelectionKnobs index_selection_knobs; }; /** @@ -55,6 +64,7 @@ class BrainJob { * provided BrainEnvironment for interaction with Brain's resources. */ virtual void OnJobInvocation(BrainEnvironment *) = 0; + private: BrainEnvironment *env_; }; @@ -68,6 +78,7 @@ class SimpleBrainJob : public BrainJob { std::function task) : BrainJob(env), task_(std::move(task)) {} inline void OnJobInvocation(BrainEnvironment *env) override { task_(env); } + private: std::function task_; }; @@ -83,13 +94,12 @@ class Brain { Brain() : scheduler_(0) {} ~Brain() { - for (auto entry : jobs_) - delete entry.second; + for (auto entry : jobs_) delete entry.second; } template - inline void RegisterJob(const struct timeval *period, - std::string name, Args... 
args) { + inline void RegisterJob(const struct timeval *period, std::string name, + Args... args) { auto *job = new BrainJob(&env_, args...); jobs_[name] = job; auto callback = [](int, short, void *arg) { @@ -99,13 +109,9 @@ class Brain { scheduler_.RegisterPeriodicEvent(period, callback, job); } - inline void Run() { - scheduler_.EventLoop(); - } + inline void Run() { scheduler_.EventLoop(); } - inline void Terminate() { - scheduler_.ExitLoop(); - } + inline void Terminate() { scheduler_.ExitLoop(); } private: NotifiableTask scheduler_; @@ -113,5 +119,5 @@ class Brain { std::unordered_map job_handles_; BrainEnvironment env_; }; -} // namespace brain -} // namespace peloton +} // namespace brain +} // namespace peloton diff --git a/src/include/brain/index_selection.h b/src/include/brain/index_selection.h new file mode 100644 index 00000000000..822b5e1385f --- /dev/null +++ b/src/include/brain/index_selection.h @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection.h +// +// Identification: src/include/brain/index_selection.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "brain/index_selection_context.h" +#include "brain/index_selection_util.h" +#include "catalog/index_catalog.h" +#include "expression/tuple_value_expression.h" +#include "parser/sql_statement.h" + +namespace peloton { +namespace brain { + +/** + * @brief Comparator for set of (Index Configuration, Cost) + */ +struct IndexConfigComparator { + IndexConfigComparator(Workload &workload) { this->w = &workload; } + bool operator()(const std::pair &s1, + const std::pair &s2) const { + // Order by cost. 
If cost is same, then by the number of indexes + // Unless the configuration is exactly the same, get some ordering + + if (s1.second < s2.second) { + return true; + } else if (s1.second > s2.second) { + return false; + } else { + if (s1.first.GetIndexCount() > s2.first.GetIndexCount()) { + return true; + } else if (s1.first.GetIndexCount() < s2.first.GetIndexCount()) { + return false; + } else { + // TODO[Siva]: Change this to a better one, choose the one with bigger/ + // smaller indexes + return (s1.first.ToString() < s2.first.ToString()); + } + } + } + + Workload *w; +}; + +//===--------------------------------------------------------------------===// +// IndexSelection +//===--------------------------------------------------------------------===// + +class IndexSelection { + public: + /** + * IndexSelection + * + * @param query_set set of queries as a workload + * @param knobs the tunable parameters of the algorithm that includes + * number of indexes to be chosen, threshold for naive enumeration, + * maximum number of columns in each index. + */ + IndexSelection(Workload &query_set, IndexSelectionKnobs knobs, + concurrency::TransactionContext *txn); + + /** + * @brief The main external API for the Index Prediction Tool + * @returns The best possible Index Configurations for the workload + */ + void GetBestIndexes(IndexConfiguration &final_indexes); + + /** + * @brief Gets the indexable columns of a given query + */ + void GetAdmissibleIndexes(std::shared_ptr query, + IndexConfiguration &indexes); + + /** + * @brief GenerateCandidateIndexes. + * If the admissible config set is empty, generate + * the single-column (admissible) indexes for each query from the provided + * queries and prune the useless ones. This becomes candidate index set. If + * not empty, prune the useless indexes from the candidate set for the given + * workload. + * + * @param candidate_config - new candidate index to be pruned. 
+ * @param admissible_config - admissible index set of the queries + * @param workload - queries + */ + void GenerateCandidateIndexes(IndexConfiguration &candidate_config, + IndexConfiguration &admissible_config, + Workload &workload); + + /** + * @brief gets the top k indexes for the workload which would reduce the cost + * of executing them + * + * @param indexes - the indexes in the workload + * @param top_indexes - the top k cheapest indexes in the workload are + * returned through this parameter + * @param workload - the given workload + * @param k - the number of indexes to return + */ + void Enumerate(IndexConfiguration &indexes, IndexConfiguration &top_indexes, + Workload &workload, size_t k); + + /** + * @brief generate multi-column indexes from the single column indexes by + * doing a cross product and adds it into the result. + * + * @param config - the set of candidate indexes chosen after the enumeration + * @param single_column_indexes - the set of admissible single column indexes + * @param result - return the set of multi column indexes + */ + void GenerateMultiColumnIndexes(IndexConfiguration &config, + IndexConfiguration &single_column_indexes, + IndexConfiguration &result); + + /** + * @brief Add a given configuration to the IndexObject pool + * return the corresponding shared pointer if the object already exists in + * the pool. Otherwise create one and return. 
+ * Currently, this is used only for unit testing + */ + std::shared_ptr AddConfigurationToPool( + HypotheticalIndexObject object); + + private: + /** + * @brief PruneUselessIndexes + * Delete the indexes from the configuration which do not help at least one of + * the queries in the workload + * + * @param config - index set + * @param workload - queries + * @param pruned_config - result configuration + */ + void PruneUselessIndexes(IndexConfiguration &config, Workload &workload, + IndexConfiguration &pruned_config); + + /** + * @brief Gets the cost of an index configuration for a given workload. It + * would call the What-If API appropriately and stores the results in the memo + * table + */ + double ComputeCost(IndexConfiguration &config, Workload &workload); + + // Configuration Enumeration related + /** + * @brief Gets the cheapest indexes through naive exhaustive enumeration by + * generating all possible subsets of size <= m where m is a tunable parameter + */ + void ExhaustiveEnumeration(IndexConfiguration &indexes, + IndexConfiguration &top_indexes, + Workload &workload); + + /** + * @brief Gets the remaining cheapest indexes through greedy search + */ + void GreedySearch(IndexConfiguration &indexes, + IndexConfiguration &remaining_indexes, Workload &workload, + size_t num_indexes); + + // Admissible index selection related + /** + * @brief Helper to parse the order where in the SQL statements such as + * select, delete, update. + */ + void IndexColsParseWhereHelper( + const expression::AbstractExpression *where_expr, + IndexConfiguration &config); + + /** + * @brief Helper to parse the group by clause in the SQL statements such as + * select, delete, update. + */ + void IndexColsParseGroupByHelper( + std::unique_ptr &where_expr, + IndexConfiguration &config); + + /** + * @brief Helper to parse the order by clause in the SQL statements such as + * select, delete, update. 
+ */ + void IndexColsParseOrderByHelper( + std::unique_ptr &order_by, + IndexConfiguration &config); + + /** + * @brief Helper function to convert a tuple of + * to an IndexObject and store into the IndexObject shared pool. + * + * @param - tuple_col: representation of a column + * @param - config: returns a new index object here + */ + void IndexObjectPoolInsertHelper( + const std::tuple &tuple_col, + IndexConfiguration &config); + + /** + * @brief Create a new index configuration which is a cross product of the + * given configurations and merge it into the result. + * result = result union (configuration1 * configuration2) + * Ex: {I1} * {I23, I45} = {I123, I145} + * + * @param - configuration1: config1 + * @param - configuration2: config2 + * @param - result: cross product + */ + void CrossProduct(const IndexConfiguration &configuration1, + const IndexConfiguration &configuration2, + IndexConfiguration &result); + + // Set of parsed and bound queries + Workload query_set_; + // Common context of index selection object. + IndexSelectionContext context_; + // Transaction. 
+ concurrency::TransactionContext *txn_; +}; + +} // namespace brain +} // namespace peloton diff --git a/src/include/brain/index_selection_context.h b/src/include/brain/index_selection_context.h new file mode 100644 index 00000000000..2f11f6ff3ea --- /dev/null +++ b/src/include/brain/index_selection_context.h @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection_context.h +// +// Identification: src/include/brain/index_selection_context.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include "brain/index_selection_util.h" + +namespace parser { +class SQLStatement; +} + +namespace peloton { +namespace brain { + +// Hasher for the KeyType of the memo used for cost evaluation +struct KeyHasher { + std::size_t operator()( + const std::pair &key) const { + auto indexes = key.first.GetIndexes(); + // TODO[Siva]: Can we do better? 
+ auto result = std::hash()(key.second->GetInfo()); + for (auto index : indexes) { + // TODO[Siva]: Use IndexObjectHasher to hash this + result ^= std::hash()(index->ToString()); + } + return result; + } +}; + +//===--------------------------------------------------------------------===// +// IndexSelectionContext +//===--------------------------------------------------------------------===// + +class IndexSelectionContext { + public: + /** + * @brief Constructor + * + */ + IndexSelectionContext(IndexSelectionKnobs knobs); + + private: + friend class IndexSelection; + + // memoization of the cost of a query for a given configuration + std::unordered_map, + double, KeyHasher> memo_; + // map from index configuration to the sharedpointer of the + // IndexConfiguration object + IndexObjectPool pool_; + + // The knobs for this run of the algorithm + IndexSelectionKnobs knobs_; +}; + +} // namespace brain +} // namespace peloton diff --git a/src/include/brain/index_selection_job.h b/src/include/brain/index_selection_job.h new file mode 100644 index 00000000000..374c978b234 --- /dev/null +++ b/src/include/brain/index_selection_job.h @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection_job.h +// +// Identification: src/include/brain/index_selection_job.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once +#include "brain.h" +#include "brain/index_selection_util.h" + +namespace peloton { + +namespace brain { +class IndexSelectionJob : public BrainJob { + public: + IndexSelectionJob(BrainEnvironment *env, uint64_t num_queries_threshold) + : BrainJob(env), + last_timestamp_(0), + num_queries_threshold_(num_queries_threshold) {} + const std::string brain_suggested_index_prefix_str = "brain_suggested_index"; + + /** + * Task function. 
+ * @param env + */ + void OnJobInvocation(BrainEnvironment *env); + + private: + /** + * Go through the queries and return the timestamp of the latest query. + * @return latest timestamp + */ + static uint64_t GetLatestQueryTimestamp( + std::vector> *); + /** + * Sends an RPC message to server for creating indexes. + * @param table_name + * @param keys + */ + void CreateIndexRPC(brain::HypotheticalIndexObject *index); + + /** + * Finds current indexes - suggested indexes. + * @param cur_indexes + * @param best_config + * @return indexes that are not useful and to be dropped. + */ + std::vector> GetIndexesToDrop( + std::unordered_map> + &cur_indexes, + brain::IndexConfiguration best_config); + + /** + * Sends an RPC message to server for drop indexes. + * @param index + */ + void DropIndexRPC(oid_t database_oid, catalog::IndexCatalogObject *index); + + /** + * Timestamp of the latest query of the recently processed + * query workload. + */ + uint64_t last_timestamp_; + /** + * Tuning threshold in terms of queries + * Run the index suggestion only if the number of new queries + * in the workload exceeds this number + */ + uint64_t num_queries_threshold_; +}; +} // peloton brain + +} // namespace peloton diff --git a/src/include/brain/index_selection_util.h b/src/include/brain/index_selection_util.h new file mode 100644 index 00000000000..eb52194d910 --- /dev/null +++ b/src/include/brain/index_selection_util.h @@ -0,0 +1,297 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection_util.h +// +// Identification: src/include/brain/index_selection_util.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include "binder/bind_node_visitor.h" +#include "catalog/index_catalog.h" +#include "concurrency/transaction_manager_factory.h" 
+#include "parser/sql_statement.h" +#include "parser/postgresparser.h" +#include "concurrency/transaction_context.h" + +namespace peloton { +namespace brain { + +//===--------------------------------------------------------------------===// +// IndexSuggestionKnobs +//===--------------------------------------------------------------------===// + +// Tunable knobs of the index selection algorithm +struct IndexSelectionKnobs { + // The number of iterations of the main algorithm which is also the maximum + // number of columns in a single index as in ith iteration we consider indexes + // with i or lesser columns + size_t num_iterations_; + // The number of indexes up to which we will do exhaustive enumeration + size_t naive_enumeration_threshold_; + // The number of indexes in the final configuration returned by the + // IndexSelection algorithm + size_t num_indexes_; +}; + +//===--------------------------------------------------------------------===// +// IndexObject +//===--------------------------------------------------------------------===// + +// Class to represent a (hypothetical) index +struct HypotheticalIndexObject { + // the OID of the database + oid_t db_oid; + // the OID of the table + oid_t table_oid; + // OIDs of each column in the index + std::vector column_oids; + + /** + * @brief - Constructor + */ + HypotheticalIndexObject(){}; + + /** + * @brief - Constructor + */ + HypotheticalIndexObject(oid_t db_oid, oid_t table_oid, oid_t col_oid) + : db_oid(db_oid), table_oid(table_oid) { + column_oids.push_back(col_oid); + } + + /** + * @brief - Constructor + */ + HypotheticalIndexObject(oid_t db_oid, oid_t table_oid, + std::vector &col_oids) + : db_oid(db_oid), table_oid(table_oid), column_oids(col_oids) {} + + /** + * @brief - Equality operator of the index object + */ + bool operator==(const HypotheticalIndexObject &obj) const; + + /** + * @brief - Checks whether the 2 indexes can be merged to make a multi column + * index. 
Return true if they are in the same database and table, else false + */ + bool IsCompatible(std::shared_ptr index) const; + + /** + * @brief - Merges the 2 index objects to make a multi column index + */ + HypotheticalIndexObject Merge(std::shared_ptr index); + + const std::string ToString() const; +}; + +//===--------------------------------------------------------------------===// +// IndexConfiguration +//===--------------------------------------------------------------------===// + +// Hasher for the IndexObject +struct IndexObjectHasher { + size_t operator()(const HypotheticalIndexObject &obj) const { + return std::hash()(obj.ToString()); + } +}; + +// Class to represent a configuration - a set of hypothetical indexes +class IndexConfiguration { + public: + /** + * @brief - Constructor + */ + IndexConfiguration() {} + + /** + * @brief - Constructor + */ + IndexConfiguration( + std::set> &index_obj_set) + : indexes_(index_obj_set) {} + + /** + * @brief - Merges with the argument configuration + */ + void Merge(IndexConfiguration &config); + + /** + * @brief replace config + */ + void Set(IndexConfiguration &config); + + /** + * @brief - Adds an index into the configuration + */ + void AddIndexObject( + const std::shared_ptr &index_info); + + /** + * @brief - Removes an index from the configuration + */ + void RemoveIndexObject( + const std::shared_ptr &index_info); + + /** + * @brief - Returns the number of indexes in the configuration + */ + size_t GetIndexCount() const; + + /** + * @brief is empty + * @return bool + */ + bool IsEmpty() const; + + /** + * @brief - Returns the indexes in the configuration + */ + const std::set> &GetIndexes() const; + + /** + * @brief - Equality operator of the index configurations + */ + bool operator==(const IndexConfiguration &obj) const; + + /** + * @brief - Set difference of the two configurations + */ + IndexConfiguration operator-(const IndexConfiguration &obj); + + const std::string ToString() const; + + void Clear(); + + 
private: + // The set of hypothetical indexes in the configuration + std::set> indexes_; +}; + +//===--------------------------------------------------------------------===// +// IndexObjectPool +//===--------------------------------------------------------------------===// + +// This class is a wrapper around a map from the IndexConfiguration to the +// shared pointer of the object. This shared pointer is used elsewhere in the +// algorithm to identify a configuration - memoization, enumeration, +// equality while sorting etc. +class IndexObjectPool { + public: + /** + * @brief - Constructor + */ + IndexObjectPool() {} + + /** + * @brief - Return the shared pointer of the object from the pool + */ + std::shared_ptr GetIndexObject( + HypotheticalIndexObject &obj); + + /** + * @brief - Add the object to the pool of index objects + * if the object already exists, return the shared pointer + * else create the object, add it to the pool and return the shared pointer + */ + std::shared_ptr PutIndexObject( + HypotheticalIndexObject &obj); + + private: + // The mapping from the object to the shared pointer + std::unordered_map, + IndexObjectHasher> map_; +}; + +//===--------------------------------------------------------------------===// +// Workload +//===--------------------------------------------------------------------===// + +// Represents a workload of SQL queries +class Workload { + public: + /** + * @brief - Constructor + */ + Workload(std::string database_name) : database_name(database_name) {} + + /** + * @brief - Initialize a workload with the given query strings. Parse, bind + * and + * add SQLStatements. 
+ */ + Workload(std::vector &queries, std::string database_name, + concurrency::TransactionContext *txn); + + /** + * @brief - Constructor + */ + Workload(std::pair, + std::unordered_set> query, + std::string database_name) + : sql_queries_({query}), database_name(database_name) {} + + /** + * @brief - Add a query into the workload + */ + inline void AddQuery(std::shared_ptr query, + std::unordered_set tables) { + sql_queries_.push_back(std::make_pair(query, tables)); + } + + /** + * @brief - Return the queries + */ + inline const std::vector, + std::unordered_set>> + &GetQueries() { + return sql_queries_; + } + + /** + * @brief - Return the number of queries in the workload + */ + inline size_t Size() { return sql_queries_.size(); } + + /** + * @brief Return the database name + */ + inline std::string GetDatabaseName() { + PELOTON_ASSERT(database_name != ""); + return database_name; + }; + + /** + * @brief GetTableNamesReferenced + * Given a parsed & bound query, this function returns all the tables + * referenced. + * @param query - a parsed and bound SQL statement + * @param table_names - where the table names will be stored. + */ + static void GetTableNamesReferenced( + std::shared_ptr query, + std::unordered_set &table_names); + + private: + /** + * Parsed SQL queries along with the referenced table names. 
+ */ + std::vector, + std::unordered_set>> sql_queries_; + std::string database_name; +}; + +} // namespace brain +} // namespace peloton diff --git a/src/include/brain/what_if_index.h b/src/include/brain/what_if_index.h new file mode 100644 index 00000000000..99e1417eb1b --- /dev/null +++ b/src/include/brain/what_if_index.h @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// what_if_index.h +// +// Identification: src/include/brain/what_if_index.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include "brain/index_selection_util.h" +#include "catalog/catalog.h" +#include "catalog/column_catalog.h" +#include "catalog/database_catalog.h" +#include "catalog/index_catalog.h" +#include "catalog/table_catalog.h" +#include "common/internal_types.h" +#include "optimizer/optimizer.h" +#include "parser/postgresparser.h" + +namespace peloton { +namespace brain { + +/** + * @brief Static class to query what-if cost of an index set. + */ +class WhatIfIndex { + public: + /** + * @brief GetCostAndBestPlanTree + * Perform optimization on the given parsed & bound SQL statement and + * return the best physical plan tree and the cost associated with it. + * + * @param query - parsed and bound query + * @param config - a hypothetical index configuration + * @param database_name - database name string + * @param transaction - already created transaction object. + * @return physical plan info + */ + static std::unique_ptr GetCostAndBestPlanTree( + std::shared_ptr query, IndexConfiguration &config, + std::string database_name, concurrency::TransactionContext *txn); + + /** + * @brief GetCostAndBestPlanTree + * Perform optimization on the given parsed & bound SQL statement and + * return the best physical plan tree and the cost associated with it. 
+ * + * Use this when the referenced table names are already known. + * + * @param query + * @param tables_used + * @param config + * @param database_name + * @param txn + * @return + */ + static std::unique_ptr GetCostAndBestPlanTree( + std::pair, + std::unordered_set> query, + IndexConfiguration &config, std::string database_name, + concurrency::TransactionContext *txn); + + private: + /** + * @brief Creates a hypothetical index catalog object, that would be used + * to fill the catalog cache. + * + * @param obj - Index object + * @return index catalog object + */ + static std::shared_ptr CreateIndexCatalogObject( + HypotheticalIndexObject *obj); + /** + * @brief a monotonically increasing sequence number for creating dummy oids + * for the given hypothetical indexes. + */ + static unsigned long index_seq_no; +}; + +} // namespace brain +} // namespace peloton diff --git a/src/include/capnp/peloton_service.capnp b/src/include/capnp/peloton_service.capnp index 80f8c38a171..2e44fa39d6e 100644 --- a/src/include/capnp/peloton_service.capnp +++ b/src/include/capnp/peloton_service.capnp @@ -1,20 +1,28 @@ @0xf3d342883f3f0344; struct CreateIndexRequest { - databaseName @0 :Text; - tableName @1 :Text; + databaseOid @0 :Int32; + tableOid @1 :Int32; - keyAttrs @2 :List(Int32); + keyAttrOids @2 :List(Int32); indexName @3 :Text; uniqueKeys @4 :Bool; - - indexKeys @5 :Int32; } struct CreateIndexResponse { message @0 :Text; } +struct DropIndexRequest { + databaseOid @0 :Int32; + indexOid @1 :Int32; +} + +struct DropIndexResponse { + message @0 :Text; +} + interface PelotonService { createIndex @0 (request :CreateIndexRequest) -> (response :CreateIndexResponse); + dropIndex @1 (request :DropIndexRequest) -> (response :DropIndexResponse); } diff --git a/src/include/catalog/abstract_catalog.h b/src/include/catalog/abstract_catalog.h index e0c8d81df53..15a66b15a99 100644 --- a/src/include/catalog/abstract_catalog.h +++ b/src/include/catalog/abstract_catalog.h @@ -6,7 +6,7 @@ // // 
Identification: src/include/catalog/abstract_catalog.h // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -67,6 +67,13 @@ class AbstractCatalog { std::vector values, concurrency::TransactionContext *txn) const; + std::unique_ptr>> + GetResultWithIndexScan(const std::vector &column_offsets, + const oid_t &index_offset, + const std::vector &values, + const std::vector &expr_types, + concurrency::TransactionContext *txn) const; + std::unique_ptr>> GetResultWithSeqScan(std::vector column_offsets, expression::AbstractExpression *predicate, diff --git a/src/include/catalog/index_catalog.h b/src/include/catalog/index_catalog.h index 3ece01952b9..6c80b35377d 100644 --- a/src/include/catalog/index_catalog.h +++ b/src/include/catalog/index_catalog.h @@ -6,29 +6,7 @@ // // Identification: src/include/catalog/index_catalog.h // -// Copyright (c) 2015-17, Carnegie Mellon University Index Group -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// pg_index -// -// Schema: (column: column_name) -// 0: index_oid (pkey) -// 1: index_name -// 2: table_oid (which table this index belongs to) -// 3: schema_name (which namespace this index belongs to) -// 4: index_type (default value is BWTREE) -// 5: index_constraint -// 6: unique_keys (is this index supports duplicate keys) -// 7: indexed_attributes (indicate which table columns this index indexes. For -// example a value of 0 2 would mean that the first and the third table columns -// make up the index.) 
-// -// Indexes: (index offset: indexed columns) -// 0: index_oid (unique & primary key) -// 1: index_name & schema_name (unique) -// 2: table_oid (non-unique) +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -36,6 +14,7 @@ #include "catalog/abstract_catalog.h" #include "executor/logical_tile.h" +#include namespace peloton { namespace catalog { @@ -46,6 +25,11 @@ class IndexCatalogObject { public: IndexCatalogObject(executor::LogicalTile *tile, int tupleId = 0); + // This constructor should only be used for what-if index API. + IndexCatalogObject(oid_t index_oid, std::string index_name, oid_t table_oid, + IndexType index_type, IndexConstraintType index_constraint, + bool unique_keys, std::vector &key_attrs); + inline oid_t GetIndexOid() { return index_oid; } inline const std::string &GetIndexName() { return index_name; } inline oid_t GetTableOid() { return table_oid; } @@ -94,6 +78,14 @@ class IndexCatalog : public AbstractCatalog { const std::string &index_name, const std::string &schema_name, concurrency::TransactionContext *txn); + /** + * Get all the indexes present in the catalog. + * @param txn + * @return Returns vector of index catalog objects. 
+ */ + std::unordered_map> + GetIndexObjects(concurrency::TransactionContext *txn); + private: std::shared_ptr GetIndexObject( oid_t index_oid, concurrency::TransactionContext *txn); diff --git a/src/include/catalog/query_history_catalog.h b/src/include/catalog/query_history_catalog.h index 3f004508d02..8bd7e6608f4 100644 --- a/src/include/catalog/query_history_catalog.h +++ b/src/include/catalog/query_history_catalog.h @@ -10,16 +10,6 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// pg_query -// -// Schema: (column offset: column_name) -// 0: query_string -// 1: fingerprint -// 2: timestamp -// -//===----------------------------------------------------------------------===// - #pragma once #include "catalog/abstract_catalog.h" @@ -46,6 +36,10 @@ class QueryHistoryCatalog : public AbstractCatalog { type::AbstractPool *pool, concurrency::TransactionContext *txn); + std::unique_ptr>> + GetQueryStringsAfterTimestamp(const uint64_t start_timestamp, + concurrency::TransactionContext *txn); + enum ColumnId { QUERY_STRING = 0, FINGERPRINT = 1, @@ -57,6 +51,11 @@ class QueryHistoryCatalog : public AbstractCatalog { // Pool to use for variable length strings type::EphemeralPool pool_; + + enum IndexId { + SECONDARY_KEY_0 = 0, + // Add new indexes here in creation order + }; }; } // namespace catalog diff --git a/src/include/catalog/table_catalog.h b/src/include/catalog/table_catalog.h index 0dfc3f51fa9..6d3ed7c1fdb 100644 --- a/src/include/catalog/table_catalog.h +++ b/src/include/catalog/table_catalog.h @@ -6,24 +6,7 @@ // // Identification: src/include/catalog/table_catalog.h // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// pg_table -// -// Schema: (column 
position: column_name) -// 0: table_oid (pkey) -// 1: table_name, -// 2: schema_name (the namespace name that this table belongs to) -// 3: database_oid -// 4: version_id: for fast ddl(alter table) -// -// Indexes: (index offset: indexed columns) -// 0: table_oid (unique & primary key) -// 1: table_name & schema_name(unique) -// 2: database_oid (non-unique) +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -62,6 +45,11 @@ class TableCatalogObject { std::shared_ptr GetIndexObject( const std::string &index_name, bool cached_only = false); + // Get index objects + bool InsertIndexObject(std::shared_ptr index_object); + bool EvictIndexObject(oid_t index_oid); + bool EvictIndexObject(const std::string &index_name); + // Get columns void EvictAllColumnObjects(); std::unordered_map> @@ -79,6 +67,9 @@ class TableCatalogObject { inline oid_t GetDatabaseOid() { return database_oid; } inline uint32_t GetVersionId() { return version_id; } + // NOTE: should be only used by What-if API. 
+ void SetValidIndexObjects(bool is_valid); + private: // member variables oid_t table_oid; @@ -87,11 +78,6 @@ class TableCatalogObject { oid_t database_oid; uint32_t version_id; - // Get index objects - bool InsertIndexObject(std::shared_ptr index_object); - bool EvictIndexObject(oid_t index_oid); - bool EvictIndexObject(const std::string &index_name); - // Get column objects bool InsertColumnObject(std::shared_ptr column_object); bool EvictColumnObject(oid_t column_id); diff --git a/src/include/index/bwtree.h b/src/include/index/bwtree.h index 4849682ab3c..f9352aad09a 100755 --- a/src/include/index/bwtree.h +++ b/src/include/index/bwtree.h @@ -7585,7 +7585,7 @@ class BwTree : public BwTreeBase { // would always fail, until we have cleaned all epoch nodes current_epoch_p = nullptr; - LOG_DEBUG("Clearing the epoch in ~EpochManager()..."); + LOG_TRACE("Clearing the epoch in ~EpochManager()..."); // If all threads has exited then all thread counts are // 0, and therefore this should proceed way to the end diff --git a/src/include/network/peloton_rpc_handler_task.h b/src/include/network/peloton_rpc_handler_task.h index 8abfa510af4..e1de4a4dcc2 100644 --- a/src/include/network/peloton_rpc_handler_task.h +++ b/src/include/network/peloton_rpc_handler_task.h @@ -11,24 +11,199 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include "capnp/ez-rpc.h" #include "capnp/message.h" +#include "catalog/catalog.h" #include "common/dedicated_thread_task.h" #include "common/logger.h" +#include "common/internal_types.h" #include "kj/debug.h" #include "peloton/capnp/peloton_service.capnp.h" +#include "codegen/buffering_consumer.h" +#include "executor/executor_context.h" +#include "planner/populate_index_plan.h" +#include "storage/storage_manager.h" +#include "planner/seq_scan_plan.h" +#include "catalog/system_catalogs.h" +#include "catalog/column_catalog.h" +#include "binder/bind_node_visitor.h" +#include "catalog/catalog.h" 
+#include "common/logger.h" +#include "concurrency/transaction_manager_factory.h" +#include "executor/plan_executor.h" +#include "gmock/gtest/gtest.h" +#include "optimizer/optimizer.h" +#include "optimizer/rule.h" +#include "parser/postgresparser.h" +#include "planner/plan_util.h" +#include "optimizer/stats/stats_storage.h" +#include "traffic_cop/traffic_cop.h" namespace peloton { namespace network { class PelotonRpcServerImpl final : public PelotonService::Server { + private: + static std::atomic_int counter_; + protected: - kj::Promise createIndex(CreateIndexContext) override { - // TODO(tianyu) Write actual index code - LOG_DEBUG("Received rpc to create index"); + kj::Promise dropIndex(DropIndexContext request) override { + auto database_oid = request.getParams().getRequest().getDatabaseOid(); + auto index_oid = request.getParams().getRequest().getIndexOid(); + LOG_TRACE("Database oid: %d", database_oid); + LOG_TRACE("Index oid: %d", index_oid); + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Drop index. Fail if it doesn't exist. 
+ auto catalog = catalog::Catalog::GetInstance(); + try { + catalog->DropIndex(database_oid, index_oid, txn); + } catch (CatalogException e) { + LOG_ERROR("Drop Index Failed"); + txn_manager.AbortTransaction(txn); + return kj::NEVER_DONE; + } + txn_manager.CommitTransaction(txn); return kj::READY_NOW; } -}; + kj::Promise createIndex(CreateIndexContext request) override { + LOG_DEBUG("Received RPC to create index"); + + auto database_oid = request.getParams().getRequest().getDatabaseOid(); + auto table_oid = request.getParams().getRequest().getTableOid(); + auto col_oids = request.getParams().getRequest().getKeyAttrOids(); + auto index_name = request.getParams().getRequest().getIndexName(); + + std::vector col_oid_vector; + LOG_DEBUG("Database oid: %d", database_oid); + LOG_DEBUG("Table oid: %d", table_oid); + for (auto col : col_oids) { + LOG_DEBUG("Col oid: %d", col); + col_oid_vector.push_back(col); + } + + // Create transaction to query the catalog. + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Get the existing table so that we can find its oid and the cols oids. + std::shared_ptr table_object; + try { + table_object = catalog::Catalog::GetInstance()->GetTableObject( + database_oid, table_oid, txn); + } catch (CatalogException e) { + LOG_ERROR("Exception ocurred while getting table: %s", + e.GetMessage().c_str()); + PELOTON_ASSERT(false); + } + + auto table_name = table_object->GetTableName(); + auto col_obj_pairs = table_object->GetColumnObjects(); + + // Done with the transaction. + txn_manager.CommitTransaction(txn); + + // Get all the column names from the oids. 
+ std::vector column_names; + for (auto col_oid : col_oid_vector) { + auto found_itr = col_obj_pairs.find(col_oid); + if (found_itr != col_obj_pairs.end()) { + auto col_obj = found_itr->second; + column_names.push_back(col_obj->GetColumnName()); + } else { + PELOTON_ASSERT(false); + } + } + + // Create "CREATE INDEX" query. + std::ostringstream oss; + oss << "CREATE INDEX " << index_name.cStr() << " ON "; + oss << table_name << "("; + for (auto i = 0UL; i < column_names.size(); i++) { + oss << column_names[i]; + if (i < (column_names.size() - 1)) { + oss << ","; + } + } + oss << ")"; + + LOG_DEBUG("Executing Create Index Query: %s", oss.str().c_str()); + + // Execute the SQL query + std::vector result; + std::vector tuple_descriptor; + std::string error_message; + int rows_affected; + + ExecuteSQLQuery(oss.str(), result, tuple_descriptor, rows_affected, + error_message); + LOG_INFO("Execute query done"); + + return kj::READY_NOW; + } + + static void UtilTestTaskCallback(void *arg) { + std::atomic_int *count = static_cast(arg); + count->store(0); + } + + // TODO: Avoid using this function. + // Copied from SQL testing util. 
+ // Execute a SQL query end-to-end + ResultType ExecuteSQLQuery(const std::string query, + std::vector &result, + std::vector &tuple_descriptor, + int &rows_changed, std::string &error_message) { + std::atomic_int counter_; + + LOG_INFO("Query: %s", query.c_str()); + // prepareStatement + std::string unnamed_statement = "unnamed"; + auto &peloton_parser = parser::PostgresParser::GetInstance(); + auto sql_stmt_list = peloton_parser.BuildParseTree(query); + PELOTON_ASSERT(sql_stmt_list); + if (!sql_stmt_list->is_valid) { + return ResultType::FAILURE; + } + + tcop::TrafficCop traffic_cop_(UtilTestTaskCallback, &counter_); + + auto statement = traffic_cop_.PrepareStatement(unnamed_statement, query, + std::move(sql_stmt_list)); + if (statement.get() == nullptr) { + traffic_cop_.setRowsAffected(0); + rows_changed = 0; + error_message = traffic_cop_.GetErrorMessage(); + return ResultType::FAILURE; + } + // Execute Statement + std::vector param_values; + bool unnamed = false; + std::vector result_format(statement->GetTupleDescriptor().size(), 0); + // SetTrafficCopCounter(); + counter_.store(1); + auto status = traffic_cop_.ExecuteStatement( + statement, param_values, unnamed, nullptr, result_format, result); + if (traffic_cop_.GetQueuing()) { + while (counter_.load() == 1) { + usleep(10); + } + traffic_cop_.ExecuteStatementPlanGetResult(); + status = traffic_cop_.ExecuteStatementGetResult(); + traffic_cop_.SetQueuing(false); + } + if (status == ResultType::SUCCESS) { + tuple_descriptor = statement->GetTupleDescriptor(); + } + LOG_INFO("Statement executed. 
Result: %s", + ResultTypeToString(status).c_str()); + rows_changed = traffic_cop_.getRowsAffected(); + return status; + } +}; class PelotonRpcHandlerTask : public DedicatedThreadTask { public: diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 82b1d4c9a05..8b4c89c0509 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -15,15 +15,15 @@ #include #include "optimizer/abstract_optimizer.h" -#include "optimizer/property_set.h" #include "optimizer/optimizer_metadata.h" +#include "optimizer/property_set.h" namespace peloton { namespace parser { class SQLStatementList; class SQLStatement; -} +} // namespace parser namespace planner { class AbstractPlan; @@ -38,9 +38,9 @@ class TransactionContext; } namespace test { - class OptimizerRuleTests_SimpleAssociativeRuleTest_Test; - class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; -} +class OptimizerRuleTests_SimpleAssociativeRuleTest_Test; +class OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; +} // namespace test namespace optimizer { @@ -53,6 +53,12 @@ struct QueryInfo { std::shared_ptr physical_props; }; +struct OptimizerPlanInfo { + OptimizerPlanInfo(){}; + std::unique_ptr plan; + double cost; +}; + //===--------------------------------------------------------------------===// // Optimizer //===--------------------------------------------------------------------===// @@ -60,8 +66,10 @@ class Optimizer : public AbstractOptimizer { friend class BindingIterator; friend class GroupBindingIterator; - friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest_Test; - friend class ::peloton::test::OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; + friend class ::peloton::test:: + OptimizerRuleTests_SimpleAssociativeRuleTest_Test; + friend class ::peloton::test:: + OptimizerRuleTests_SimpleAssociativeRuleTest2_Test; public: Optimizer(const Optimizer &) = delete; @@ -75,6 +83,11 @@ class Optimizer : public AbstractOptimizer { 
const std::unique_ptr &parse_tree_list, concurrency::TransactionContext *txn) override; + // Used by What-if API + std::unique_ptr GetOptimizedPlanInfo( + std::shared_ptr parsed_statement, + concurrency::TransactionContext *txn); + void OptimizeLoop(int root_group_id, std::shared_ptr required_props); @@ -83,13 +96,13 @@ class Optimizer : public AbstractOptimizer { OptimizerMetadata &GetMetadata() { return metadata_; } /* For test purposes only */ - std::shared_ptr TestInsertQueryTree(parser::SQLStatement *tree, - concurrency::TransactionContext *txn) { + std::shared_ptr TestInsertQueryTree( + parser::SQLStatement *tree, concurrency::TransactionContext *txn) { return InsertQueryTree(tree, txn); } /* For test purposes only */ void TestExecuteTaskStack(OptimizerTaskStack &task_stack, int root_group_id, - std::shared_ptr root_context) { + std::shared_ptr root_context) { return ExecuteTaskStack(task_stack, root_group_id, root_context); } diff --git a/src/include/optimizer/stats_calculator.h b/src/include/optimizer/stats_calculator.h index 5aed2902671..ef4654812dd 100644 --- a/src/include/optimizer/stats_calculator.h +++ b/src/include/optimizer/stats_calculator.h @@ -2,11 +2,11 @@ // // Peloton // -// cost_and_stats_calculator.h +// stats_calculator.h // // Identification: src/include/optimizer/stats_calculator.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -26,8 +26,8 @@ class TableStats; */ class StatsCalculator : public OperatorVisitor { public: - void CalculateStats(GroupExpression *gexpr, ExprSet required_cols, - Memo *memo, concurrency::TransactionContext* txn); + void CalculateStats(GroupExpression *gexpr, ExprSet required_cols, Memo *memo, + concurrency::TransactionContext *txn); void Visit(const LogicalGet *) override; void Visit(const LogicalQueryDerivedGet *) override; @@ -68,14 
+68,10 @@ class StatsCalculator : public OperatorVisitor { &predicate_stats, const std::vector &predicates); - double CalculateSelectivityForPredicate( - const std::shared_ptr predicate_table_stats, - const expression::AbstractExpression *expr); - GroupExpression *gexpr_; ExprSet required_cols_; Memo *memo_; - concurrency::TransactionContext* txn_; + concurrency::TransactionContext *txn_; }; } // namespace optimizer diff --git a/src/include/optimizer/util.h b/src/include/optimizer/util.h index 8b9eb4baeef..6a57086a0d0 100644 --- a/src/include/optimizer/util.h +++ b/src/include/optimizer/util.h @@ -6,7 +6,7 @@ // // Identification: src/include/optimizer/util.h // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -17,6 +17,7 @@ #include #include "expression/abstract_expression.h" +#include "optimizer/stats/table_stats.h" #include "parser/copy_statement.h" #include "planner/abstract_plan.h" @@ -33,11 +34,11 @@ class DataTable; namespace optimizer { namespace util { - /** - * @brief Convert upper case letters into lower case in a string - * - * @param str The string to operate on - */ +/** + * @brief Convert upper case letters into lower case in a string + * + * @param str The string to operate on + */ inline void to_lower_string(std::string &str) { std::transform(str.begin(), str.end(), str.begin(), ::tolower); } @@ -110,7 +111,6 @@ expression::AbstractExpression *ConstructJoinPredicate( std::unordered_set &table_alias_set, MultiTablePredicates &join_predicates); - /** * @breif Check if there are any join columns in the join expression * For example, expr = (expr_1) AND (expr_2) AND (expr_3) @@ -167,6 +167,18 @@ void ExtractEquiJoinKeys( const std::unordered_set &left_alias, const std::unordered_set &right_alias); +/** + * @brief Calculate selectivity after applying predicates on a table + * + 
* @param predicate_table_stats the incoming table stats + * @param expr the predicate + * + * @return updated selectivity + */ +double CalculateSelectivityForPredicate( + const std::shared_ptr predicate_table_stats, + const expression::AbstractExpression *expr); + } // namespace util } // namespace optimizer } // namespace peloton diff --git a/src/main/peloton/peloton.cpp b/src/main/peloton/peloton.cpp index 8c5e0b204c6..646b4d5c2df 100644 --- a/src/main/peloton/peloton.cpp +++ b/src/main/peloton/peloton.cpp @@ -6,7 +6,7 @@ // // Identification: src/main/peloton/peloton.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -18,6 +18,7 @@ #include "network/peloton_server.h" #include "settings/settings_manager.h" #include "brain/brain.h" +#include "brain/index_selection_job.h" // For GFlag's built-in help message flag DECLARE_bool(help); @@ -35,8 +36,9 @@ int RunPelotonServer() { peloton_server.SetupServer().ServerLoop(); } catch (peloton::ConnectionException &exception) { - //log error message and mark failure - peloton::LOG_ERROR("Cannot start server. Failure detail : %s\n", exception.GetMessage().c_str()); + // log error message and mark failure + peloton::LOG_ERROR("Cannot start server. 
Failure detail : %s\n", + exception.GetMessage().c_str()); return_code = EXIT_FAILURE; } @@ -45,32 +47,26 @@ int RunPelotonServer() { return return_code; } - int RunPelotonBrain() { // TODO(tianyu): boot up other peloton resources as needed here peloton::brain::Brain brain; evthread_use_pthreads(); // TODO(tianyu): register jobs here - struct timeval one_second; - one_second.tv_sec = 1; - one_second.tv_usec = 0; - - auto example_task = [](peloton::brain::BrainEnvironment *) { - // TODO(tianyu): Replace with real address - capnp::EzRpcClient client("localhost:15445"); - PelotonService::Client peloton_service = client.getMain(); - auto request = peloton_service.createIndexRequest(); - request.getRequest().setIndexKeys(42); - auto response = request.send().wait(client.getWaitScope()); - }; - - brain.RegisterJob(&one_second, "test", example_task); + struct timeval one_minute; + one_minute.tv_sec = 10; + one_minute.tv_usec = 0; + + // The handler for the Index Suggestion related RPC calls to create/drop + // indexes + // TODO[vamshi]: Remove this hard coding + auto num_queries_threshold = 2; + brain.RegisterJob( + &one_minute, "index_suggestion", num_queries_threshold); brain.Run(); return 0; } int main(int argc, char *argv[]) { - // Parse the command line flags ::google::ParseCommandLineNonHelpFlags(&argc, &argv, true); @@ -83,20 +79,22 @@ int main(int argc, char *argv[]) { try { // Print settings if (peloton::settings::SettingsManager::GetBool( - peloton::settings::SettingId::display_settings)) { + peloton::settings::SettingId::display_settings)) { auto &settings = peloton::settings::SettingsManager::GetInstance(); settings.ShowInfo(); } } catch (peloton::SettingsException &exception) { - peloton::LOG_ERROR("Cannot load settings. Failed with %s\n", exception.GetMessage().c_str()); - return EXIT_FAILURE; // TODO: Use an enum with exit error codes + peloton::LOG_ERROR("Cannot load settings. 
Failed with %s\n", + exception.GetMessage().c_str()); + return EXIT_FAILURE; // TODO: Use an enum with exit error codes } int exit_code = 0; if (peloton::settings::SettingsManager::GetBool( - peloton::settings::SettingId::brain)) - exit_code = RunPelotonBrain(); + peloton::settings::SettingId::brain)) + exit_code = RunPelotonBrain(); else exit_code = RunPelotonServer(); + return exit_code; } diff --git a/src/optimizer/cost_calculator.cpp b/src/optimizer/cost_calculator.cpp index 5dda9e67c8a..38193e453fc 100644 --- a/src/optimizer/cost_calculator.cpp +++ b/src/optimizer/cost_calculator.cpp @@ -2,11 +2,11 @@ // // Peloton // -// cost_and_stats_calculator.h +// cost_calculator.cpp // // Identification: src/optimizer/cost_calculator.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -14,7 +14,10 @@ #include +#include "catalog/column_catalog.h" #include "catalog/table_catalog.h" +#include "catalog/index_catalog.h" +#include "expression/tuple_value_expression.h" #include "optimizer/memo.h" #include "optimizer/operators.h" #include "optimizer/stats/cost.h" @@ -50,14 +53,73 @@ void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) { auto table_stats = std::dynamic_pointer_cast( StatsStorage::GetInstance()->GetTableStats( op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_)); - if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) { + auto index_scan_rows = (double)table_stats->num_rows; + if (table_stats->GetColumnCount() == 0 || index_scan_rows == 0) { output_cost_ = 0.f; return; } + auto index_object = op->table_->GetIndexObject(op->index_id); + const auto &key_attr_list = index_object->GetKeyAttrs(); + // Loop over index to retrieve helpful index columns + // Consider all predicates that could be accelerated by the index, + // i.e. 
till the first column with no equality predicate on it + // index cols (a, b, c) + // example1 : predicates(a=1 AND b=2 AND c=3) index helps on both a, b and c + // example2 : predicates(a<1 AND b<=2 and c<3) index helps on only a + // example3 : predicates(a=1 AND b>2 AND c>3) index helps on a and b + bool has_non_equality_pred = false; + for (size_t idx = 0; idx < key_attr_list.size(); ++idx) { + // If index cannot further reduce scan range, break + if (idx == op->key_column_id_list.size() || + key_attr_list[idx] != op->key_column_id_list[idx]) { + break; + } + auto index_col_id = key_attr_list[idx]; + // Find the predicate and update scan rows accordingly + for (auto &predicate : op->predicates) { + auto &expr = predicate.expr; + // TODO(boweic): support non equality predicates + if (expr->GetExpressionType() != ExpressionType::COMPARE_EQUAL) { + has_non_equality_pred = true; + } + expression::AbstractExpression *tv_expr = nullptr; + if (expr->GetChild(0)->GetExpressionType() == + ExpressionType::VALUE_TUPLE) { + auto r_type = expr->GetChild(1)->GetExpressionType(); + if (r_type == ExpressionType::VALUE_CONSTANT || + r_type == ExpressionType::VALUE_PARAMETER) { + tv_expr = expr->GetModifiableChild(0); + } + } + if (expr->GetChild(1)->GetExpressionType() == + ExpressionType::VALUE_TUPLE) { + auto r_type = expr->GetChild(0)->GetExpressionType(); + if (r_type == ExpressionType::VALUE_CONSTANT || + r_type == ExpressionType::VALUE_PARAMETER) { + tv_expr = expr->GetModifiableChild(1); + } + } + if (tv_expr == nullptr) { + continue; + } + auto column_ref = + reinterpret_cast(tv_expr); + auto column_id = op->table_->GetColumnObject(column_ref->GetColumnName()) + ->GetColumnId(); + if (column_id != index_col_id) { + continue; + } + // update selectivity here + index_scan_rows *= + util::CalculateSelectivityForPredicate(table_stats, expr.get()); + } + if (has_non_equality_pred) { + break; + } + } // Index search cost + scan cost output_cost_ = 
std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST + - memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() * - DEFAULT_TUPLE_COST; + index_scan_rows * DEFAULT_TUPLE_COST; } void CostCalculator::Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) { output_cost_ = 0.f; @@ -88,7 +150,8 @@ void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) { memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows(); auto right_child_rows = memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows(); - // TODO(boweic): Build (left) table should have different cost to probe table + // TODO(boweic): Build (left) table should have different cost to probe + // table output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST; } void CostCalculator::Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {} diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index 62f813ec876..2152eae5614 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -6,7 +6,7 @@ // // Identification: src/optimizer/optimizer.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -21,16 +21,16 @@ #include "common/exception.h" #include "optimizer/binding.h" -#include "optimizer/input_column_deriver.h" #include "optimizer/operator_visitor.h" -#include "optimizer/optimize_context.h" -#include "optimizer/optimizer_task_pool.h" -#include "optimizer/plan_generator.h" #include "optimizer/properties.h" #include "optimizer/property_enforcer.h" #include "optimizer/query_to_operator_transformer.h" +#include "optimizer/input_column_deriver.h" +#include "optimizer/plan_generator.h" #include "optimizer/rule.h" #include "optimizer/rule_impls.h" +#include "optimizer/optimizer_task_pool.h" +#include "optimizer/optimize_context.h" #include 
"parser/create_statement.h" #include "planner/analyze_plan.h" @@ -113,7 +113,8 @@ shared_ptr Optimizer::BuildPelotonPlanTree( // Generate initial operator tree from query tree shared_ptr gexpr = InsertQueryTree(parse_tree, txn); GroupID root_id = gexpr->GetGroupID(); - // Get the physical properties the final plan must output + + // Get the physical properties and projected columns the final plan must have auto query_info = GetQueryInfo(parse_tree); try { @@ -136,6 +137,54 @@ shared_ptr Optimizer::BuildPelotonPlanTree( } } +// GetOptimizedQueryTree() +// Return an optimized physical query tree for the given parse tree along +// with the cost. +std::unique_ptr Optimizer::GetOptimizedPlanInfo( + std::shared_ptr parsed_statement, + concurrency::TransactionContext *txn) { + metadata_.txn = txn; + + // Generate initial operator tree to work with from the parsed + // statement object. + std::shared_ptr g_expr = + InsertQueryTree(parsed_statement.get(), txn); + GroupID root_id = g_expr->GetGroupID(); + + // Get the physical properties of the final plan that must be enforced + auto query_info = GetQueryInfo(parsed_statement.get()); + + // Start with the base expression and explore all the possible transformations + // and add them to the local context. + try { + OptimizeLoop(root_id, query_info.physical_props); + } catch (OptimizerException &e) { + LOG_WARN("Optimize Loop ended prematurely: %s", e.what()); + PELOTON_ASSERT(false); + } + + try { + auto best_plan = ChooseBestPlan(root_id, query_info.physical_props, + query_info.output_exprs); + auto info_obj = std::unique_ptr(new OptimizerPlanInfo()); + + // Get the cost. 
+ auto group = GetMetadata().memo.GetGroupByID(root_id); + auto best_expr = group->GetBestExpression(query_info.physical_props); + + info_obj->cost = best_expr->GetCost(query_info.physical_props); + info_obj->plan = std::move(best_plan); + + // Reset memo after finishing the optimization + Reset(); + + return info_obj; + } catch (Exception &e) { + Reset(); + throw e; + } +} + void Optimizer::Reset() { metadata_ = OptimizerMetadata(); } unique_ptr Optimizer::HandleDDLStatement( @@ -238,29 +287,29 @@ shared_ptr Optimizer::InsertQueryTree( } QueryInfo Optimizer::GetQueryInfo(parser::SQLStatement *tree) { - auto GetQueryInfoHelper = - [](std::vector> &select_list, - std::unique_ptr &order_info, - std::vector &output_exprs, - std::shared_ptr &physical_props) { - // Extract output column - for (auto &expr : select_list) output_exprs.push_back(expr.get()); - - // Extract sort property - if (order_info != nullptr) { - std::vector sort_exprs; - std::vector sort_ascending; - for (auto &expr : order_info->exprs) { - sort_exprs.push_back(expr.get()); - } - for (auto &type : order_info->types) { - sort_ascending.push_back(type == parser::kOrderAsc); - } - if (!sort_exprs.empty()) - physical_props->AddProperty( - std::make_shared(sort_exprs, sort_ascending)); - } - }; + auto GetQueryInfoHelper = []( + std::vector> &select_list, + std::unique_ptr &order_info, + std::vector &output_exprs, + std::shared_ptr &physical_props) { + // Extract output column + for (auto &expr : select_list) output_exprs.push_back(expr.get()); + + // Extract sort property + if (order_info != nullptr) { + std::vector sort_exprs; + std::vector sort_ascending; + for (auto &expr : order_info->exprs) { + sort_exprs.push_back(expr.get()); + } + for (auto &type : order_info->types) { + sort_ascending.push_back(type == parser::kOrderAsc); + } + if (!sort_exprs.empty()) + physical_props->AddProperty( + std::make_shared(sort_exprs, sort_ascending)); + } + }; std::vector output_exprs; std::shared_ptr physical_props 
= std::make_shared(); @@ -278,8 +327,7 @@ QueryInfo Optimizer::GetQueryInfo(parser::SQLStatement *tree) { output_exprs, physical_props); break; } - default: - ; + default:; } return QueryInfo(output_exprs, physical_props); diff --git a/src/optimizer/rule_impls.cpp b/src/optimizer/rule_impls.cpp index e540555c9e3..1eca5cd1d72 100644 --- a/src/optimizer/rule_impls.cpp +++ b/src/optimizer/rule_impls.cpp @@ -6,7 +6,7 @@ // // Identification: src/optimizer/rule_impls.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -313,9 +313,8 @@ void GetToIndexScan::Transform( // Check whether any index can fulfill predicate predicate evaluation if (!get->predicates.empty()) { - std::vector key_column_id_list; - std::vector expr_type_list; - std::vector value_list; + std::unordered_map> + type_value_pair_by_key_id; for (auto &pred : get->predicates) { auto expr = pred.expr.get(); if (expr->GetChildrenSize() != 2) continue; @@ -352,29 +351,26 @@ void GetToIndexScan::Transform( std::string col_name(column_ref->GetColumnName()); LOG_TRACE("Column name: %s", col_name.c_str()); auto column_id = get->table->GetColumnObject(col_name)->GetColumnId(); - key_column_id_list.push_back(column_id); - expr_type_list.push_back(expr_type); - + type::Value value; if (value_expr->GetExpressionType() == ExpressionType::VALUE_CONSTANT) { - value_list.push_back( - reinterpret_cast( - value_expr) - ->GetValue()); + value = reinterpret_cast( + value_expr) + ->GetValue(); LOG_TRACE("Value Type: %d", static_cast( reinterpret_cast( expr->GetModifiableChild(1)) ->GetValueType())); } else { - value_list.push_back( - type::ValueFactory::GetParameterOffsetValue( - reinterpret_cast( - value_expr) - ->GetValueIdx()) - .Copy()); + value = type::ValueFactory::GetParameterOffsetValue( + reinterpret_cast( + value_expr) + ->GetValueIdx()) 
+ .Copy(); LOG_TRACE("Parameter offset: %s", (*value_list.rbegin()).GetInfo().c_str()); } + type_value_pair_by_key_id[column_id] = {expr_type, value}; } } // Loop predicates end @@ -389,12 +385,19 @@ void GetToIndexScan::Transform( std::unordered_set index_col_set( index_object->GetKeyAttrs().begin(), index_object->GetKeyAttrs().end()); - for (size_t offset = 0; offset < key_column_id_list.size(); offset++) { - auto col_id = key_column_id_list[offset]; - if (index_col_set.find(col_id) != index_col_set.end()) { - index_key_column_id_list.push_back(col_id); - index_expr_type_list.push_back(expr_type_list[offset]); - index_value_list.push_back(value_list[offset]); + // If the first index key column present in the predicate's column id map + // then we would let the cost model to decide if we want to use the index + const auto &key_attr_list = index_object->GetKeyAttrs(); + if (!key_attr_list.empty() && + type_value_pair_by_key_id.count(key_attr_list[0])) { + for (const auto &key_col_oid : key_attr_list) { + if (type_value_pair_by_key_id.count(key_col_oid)) { + const auto &type_value_pair = + type_value_pair_by_key_id[key_col_oid]; + index_key_column_id_list.push_back(key_col_oid); + index_expr_type_list.push_back(type_value_pair.first); + index_value_list.push_back(type_value_pair.second); + } } } // Add transformed plan diff --git a/src/optimizer/stats/selectivity.cpp b/src/optimizer/stats/selectivity.cpp index 474ae1a71da..0586ad31eb9 100644 --- a/src/optimizer/stats/selectivity.cpp +++ b/src/optimizer/stats/selectivity.cpp @@ -6,7 +6,7 @@ // // Identification: src/optimizer/stats/selectivity.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -84,7 +84,7 @@ double Selectivity::Equal(const std::shared_ptr &table_stats, auto column_stats = 
table_stats->GetColumnStats(condition.column_name); // LOG_INFO("column name %s", condition.column_name); if (std::isnan(value) || column_stats == nullptr) { - LOG_DEBUG("Calculate selectivity: return null"); + LOG_TRACE("Calculate selectivity: return null"); return DEFAULT_SELECTIVITY; } diff --git a/src/optimizer/stats_calculator.cpp b/src/optimizer/stats_calculator.cpp index 3cdb34c4d9d..f9d5685a3c3 100644 --- a/src/optimizer/stats_calculator.cpp +++ b/src/optimizer/stats_calculator.cpp @@ -2,11 +2,11 @@ // // Peloton // -// cost_and_stats_calculator.h +// stats_calculator.cpp // // Identification: src/optimizer/stats_calculator.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -42,8 +42,8 @@ void StatsCalculator::Visit(const LogicalGet *op) { return; } auto table_stats = std::dynamic_pointer_cast( - StatsStorage::GetInstance()->GetTableStats(op->table->GetDatabaseOid(), - op->table->GetTableOid(), txn_)); + StatsStorage::GetInstance()->GetTableStats( + op->table->GetDatabaseOid(), op->table->GetTableOid(), txn_)); // First, get the required stats of the base table std::unordered_map> required_stats; for (auto &col : required_cols_) { @@ -143,7 +143,8 @@ void StatsCalculator::Visit(const LogicalInnerJoin *op) { column_stats = std::make_shared( *left_child_group->GetStats(tv_expr->GetColFullName())); } else { - PELOTON_ASSERT(right_child_group->HasColumnStats(tv_expr->GetColFullName())); + PELOTON_ASSERT( + right_child_group->HasColumnStats(tv_expr->GetColFullName())); column_stats = std::make_shared( *right_child_group->GetStats(tv_expr->GetColFullName())); } @@ -251,96 +252,12 @@ void StatsCalculator::UpdateStatsForFilter( double selectivity = 1.f; for (auto &annotated_expr : predicates) { // Loop over conjunction exprs - selectivity *= 
CalculateSelectivityForPredicate(predicate_table_stats, - annotated_expr.expr.get()); + selectivity *= util::CalculateSelectivityForPredicate( + predicate_table_stats, annotated_expr.expr.get()); } // Update selectivity memo_->GetGroupByID(gexpr_->GetGroupID())->SetNumRows(num_rows * selectivity); } -// Calculate the selectivity given the predicate and the stats of columns in the -// predicate -double StatsCalculator::CalculateSelectivityForPredicate( - const std::shared_ptr predicate_table_stats, - const expression::AbstractExpression *expr) { - double selectivity = 1.f; - if (predicate_table_stats->GetColumnCount() == 0 || - predicate_table_stats->GetColumnStats(0)->num_rows == 0) { - return selectivity; - } - // Base case : Column Op Val - if ((expr->GetChild(0)->GetExpressionType() == ExpressionType::VALUE_TUPLE && - (expr->GetChild(1)->GetExpressionType() == - ExpressionType::VALUE_CONSTANT || - expr->GetChild(1)->GetExpressionType() == - ExpressionType::VALUE_PARAMETER)) || - (expr->GetChild(1)->GetExpressionType() == ExpressionType::VALUE_TUPLE && - (expr->GetChild(0)->GetExpressionType() == - ExpressionType::VALUE_CONSTANT || - expr->GetChild(0)->GetExpressionType() == - ExpressionType::VALUE_PARAMETER))) { - int right_index = - expr->GetChild(0)->GetExpressionType() == ExpressionType::VALUE_TUPLE - ? 
1 - : 0; - - auto left_expr = expr->GetChild(1 - right_index); - PELOTON_ASSERT(left_expr->GetExpressionType() == ExpressionType::VALUE_TUPLE); - auto col_name = - reinterpret_cast(left_expr) - ->GetColFullName(); - - auto expr_type = expr->GetExpressionType(); - if (right_index == 0) { - switch (expr_type) { - case ExpressionType::COMPARE_LESSTHANOREQUALTO: - expr_type = ExpressionType::COMPARE_GREATERTHANOREQUALTO; - break; - case ExpressionType::COMPARE_LESSTHAN: - expr_type = ExpressionType::COMPARE_GREATERTHAN; - break; - case ExpressionType::COMPARE_GREATERTHANOREQUALTO: - expr_type = ExpressionType::COMPARE_LESSTHANOREQUALTO; - break; - case ExpressionType::COMPARE_GREATERTHAN: - expr_type = ExpressionType::COMPARE_LESSTHAN; - break; - default: - break; - } - } - - type::Value value; - if (expr->GetChild(right_index)->GetExpressionType() == - ExpressionType::VALUE_CONSTANT) { - value = reinterpret_cast( - expr->GetModifiableChild(right_index)) - ->GetValue(); - } else { - value = type::ValueFactory::GetParameterOffsetValue( - reinterpret_cast( - expr->GetModifiableChild(right_index)) - ->GetValueIdx()) - .Copy(); - } - ValueCondition condition(col_name, expr_type, value); - selectivity = - Selectivity::ComputeSelectivity(predicate_table_stats, condition); - } else if (expr->GetExpressionType() == ExpressionType::CONJUNCTION_AND || - expr->GetExpressionType() == ExpressionType::CONJUNCTION_OR) { - double left_selectivity = CalculateSelectivityForPredicate( - predicate_table_stats, expr->GetChild(0)); - double right_selectivity = CalculateSelectivityForPredicate( - predicate_table_stats, expr->GetChild(1)); - if (expr->GetExpressionType() == ExpressionType::CONJUNCTION_AND) { - selectivity = left_selectivity * right_selectivity; - } else { - selectivity = left_selectivity + right_selectivity - - left_selectivity * right_selectivity; - } - } - return selectivity; -} - } // namespace optimizer } // namespace peloton diff --git a/src/optimizer/util.cpp 
b/src/optimizer/util.cpp index 0d01e35e8ac..86f8c2f2862 100644 --- a/src/optimizer/util.cpp +++ b/src/optimizer/util.cpp @@ -6,7 +6,7 @@ // // Identification: src/optimizer/util.cpp // -// Copyright (c) 2015-16, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -15,6 +15,7 @@ #include "catalog/query_metrics_catalog.h" #include "concurrency/transaction_manager_factory.h" #include "expression/expression_util.h" +#include "optimizer/stats/selectivity.h" #include "planner/copy_plan.h" #include "planner/seq_scan_plan.h" #include "storage/data_table.h" @@ -179,8 +180,7 @@ std::unordered_map> ConstructSelectElementMap( std::vector> &select_list) { std::unordered_map> - res; + std::shared_ptr> res; for (auto &expr : select_list) { std::string alias; if (!expr->alias.empty()) { @@ -250,6 +250,89 @@ void ExtractEquiJoinKeys( } } +// Calculate the selectivity given the predicate and the stats of columns in the +// predicate +double CalculateSelectivityForPredicate( + const std::shared_ptr predicate_table_stats, + const expression::AbstractExpression *expr) { + double selectivity = 1.f; + if (predicate_table_stats->GetColumnCount() == 0 || + predicate_table_stats->GetColumnStats(0)->num_rows == 0) { + return selectivity; + } + // Base case : Column Op Val + if ((expr->GetChild(0)->GetExpressionType() == ExpressionType::VALUE_TUPLE && + (expr->GetChild(1)->GetExpressionType() == + ExpressionType::VALUE_CONSTANT || + expr->GetChild(1)->GetExpressionType() == + ExpressionType::VALUE_PARAMETER)) || + (expr->GetChild(1)->GetExpressionType() == ExpressionType::VALUE_TUPLE && + (expr->GetChild(0)->GetExpressionType() == + ExpressionType::VALUE_CONSTANT || + expr->GetChild(0)->GetExpressionType() == + ExpressionType::VALUE_PARAMETER))) { + int right_index = + expr->GetChild(0)->GetExpressionType() == ExpressionType::VALUE_TUPLE + ? 
1 + : 0; + + auto left_expr = expr->GetChild(1 - right_index); + auto col_name = + reinterpret_cast(left_expr) + ->GetColFullName(); + + auto expr_type = expr->GetExpressionType(); + if (right_index == 0) { + switch (expr_type) { + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + expr_type = ExpressionType::COMPARE_GREATERTHANOREQUALTO; + break; + case ExpressionType::COMPARE_LESSTHAN: + expr_type = ExpressionType::COMPARE_GREATERTHAN; + break; + case ExpressionType::COMPARE_GREATERTHANOREQUALTO: + expr_type = ExpressionType::COMPARE_LESSTHANOREQUALTO; + break; + case ExpressionType::COMPARE_GREATERTHAN: + expr_type = ExpressionType::COMPARE_LESSTHAN; + break; + default: + break; + } + } + + type::Value value; + if (expr->GetChild(right_index)->GetExpressionType() == + ExpressionType::VALUE_CONSTANT) { + value = reinterpret_cast( + expr->GetModifiableChild(right_index)) + ->GetValue(); + } else { + value = type::ValueFactory::GetParameterOffsetValue( + reinterpret_cast( + expr->GetModifiableChild(right_index)) + ->GetValueIdx()) + .Copy(); + } + ValueCondition condition(col_name, expr_type, value); + selectivity = + Selectivity::ComputeSelectivity(predicate_table_stats, condition); + } else if (expr->GetExpressionType() == ExpressionType::CONJUNCTION_AND || + expr->GetExpressionType() == ExpressionType::CONJUNCTION_OR) { + double left_selectivity = CalculateSelectivityForPredicate( + predicate_table_stats, expr->GetChild(0)); + double right_selectivity = CalculateSelectivityForPredicate( + predicate_table_stats, expr->GetChild(1)); + if (expr->GetExpressionType() == ExpressionType::CONJUNCTION_AND) { + selectivity = left_selectivity * right_selectivity; + } else { + selectivity = left_selectivity + right_selectivity - + left_selectivity * right_selectivity; + } + } + return selectivity; +} + } // namespace util } // namespace optimizer } // namespace peloton diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index 1f3d9195038..4d49612cf41 100644 
--- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -6,7 +6,7 @@ // // Identification: src/storage/data_table.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -388,7 +388,7 @@ bool DataTable::InsertTuple(const AbstractTuple *tuple, ItemPointer location, } PELOTON_ASSERT((*index_entry_ptr)->block == location.block && - (*index_entry_ptr)->offset == location.offset); + (*index_entry_ptr)->offset == location.offset); // Increase the table's number of tuples by 1 IncreaseTupleCount(1); @@ -1092,7 +1092,12 @@ void DataTable::DropIndexWithOid(const oid_t &index_oid) { indexes_.Update(index_offset, nullptr); // Drop index column info - indexes_columns_[index_offset].clear(); + // indexes_columns_[index_offset].clear(); + + // Doing this because StatsStorage::AnalyzeStatsForAllTables + // assumes that the set is completely erased when the index is + // deleted. 
+ indexes_columns_.erase(indexes_columns_.begin() + index_offset); } void DataTable::DropIndexes() { diff --git a/src/storage/tile_group_header.cpp b/src/storage/tile_group_header.cpp index 1e0b450144e..56a4cb37017 100644 --- a/src/storage/tile_group_header.cpp +++ b/src/storage/tile_group_header.cpp @@ -1,3 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// tile_group_header.cpp +// +// Identification: src/storage/tile_group_header.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + //===----------------------------------------------------------------------===// // // Peloton @@ -239,7 +251,8 @@ oid_t TileGroupHeader::GetActiveTupleCount() const { tuple_slot_id++) { txn_id_t tuple_txn_id = GetTransactionId(tuple_slot_id); if (tuple_txn_id != INVALID_TXN_ID) { - PELOTON_ASSERT(tuple_txn_id == INITIAL_TXN_ID); + // TODO Copying what Tiyanu did + // PELOTON_ASSERT(tuple_txn_id == INITIAL_TXN_ID); active_tuple_slots++; } } diff --git a/src/traffic_cop/traffic_cop.cpp b/src/traffic_cop/traffic_cop.cpp index a87d99c0ac5..0c9d1a03d04 100644 --- a/src/traffic_cop/traffic_cop.cpp +++ b/src/traffic_cop/traffic_cop.cpp @@ -6,7 +6,7 @@ // // Identification: src/traffic_cop/traffic_cop.cpp // -// Copyright (c) 2015-17, Carnegie Mellon University Database Group +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group // //===----------------------------------------------------------------------===// @@ -305,8 +305,10 @@ std::shared_ptr TrafficCop::PrepareStatement( tcop_txn_state_.emplace(txn, ResultType::SUCCESS); } + // Log the query only if we have a statement. 
if (settings::SettingsManager::GetBool(settings::SettingId::brain)) { - tcop_txn_state_.top().first->AddQueryString(query_string.c_str()); + tcop_txn_state_.top().first->AddQueryString( + query_string.c_str()); } // TODO(Tianyi) Move Statement Planing into Statement's method diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 94291523cdd..1385289866e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -48,6 +48,7 @@ set(TESTING_UTIL_STATS ${PROJECT_SOURCE_DIR}/test/statistics/testing_stats_ set(TESTING_UTIL_SQL ${PROJECT_SOURCE_DIR}/test/sql/testing_sql_util.cpp) set(TESTING_UTIL_INDEX ${PROJECT_SOURCE_DIR}/test/index/testing_index_util.cpp) set(TESTING_UTIL_CODEGEN ${PROJECT_SOURCE_DIR}/test/codegen/testing_codegen_util.cpp) +set(TESTING_UTIL_INDEX_SELECTION ${PROJECT_SOURCE_DIR}/test/brain/testing_index_selection_util.cpp) add_library(peloton-test-common EXCLUDE_FROM_ALL ${gmock_srcs} ${HARNESS} ${TESTING_UTIL_EXECUTOR} @@ -58,6 +59,7 @@ add_library(peloton-test-common EXCLUDE_FROM_ALL ${gmock_srcs} ${HARNESS} ${TESTING_UTIL_INDEX} ${TESTING_UTIL_SQL} ${TESTING_UTIL_CODEGEN} + ${TESTING_UTIL_INDEX_SELECTION} ) # --[ Add "make check" target @@ -71,37 +73,37 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} ${CTEST_FLAGS} --verbose) foreach(test_src ${test_srcs} ) #message("test_src = " ${test_src}) - + # get test file name - get_filename_component(test_bare_name ${test_src} NAME) + get_filename_component(test_bare_name ${test_src} NAME) string(REPLACE ".cpp" "" test_bare_name_without_extension ${test_bare_name}) string(REPLACE "\"" "" test_name ${test_bare_name_without_extension}) - + # create executable add_executable(${test_name} EXCLUDE_FROM_ALL ${test_src}) add_dependencies(check ${test_name}) - + #message("Correctness test: " ${test_name}) - + # link libraries - target_link_libraries(${test_name} peloton peloton-test-common) + target_link_libraries(${test_name} peloton peloton-test-common) - # set target properties + # set target 
properties set_target_properties(${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/test" COMMAND ${test_name} - ) - + ) + # add test add_test(${test_name} ${CMAKE_BINARY_DIR}/test/${test_name} --gtest_color=yes --gtest_output=xml:${CMAKE_BINARY_DIR}/test/${test_name}.xml) - + # leak suppression / whitelist set_property(TEST ${test_name} PROPERTY ENVIRONMENT "LSAN_OPTIONS=suppressions=${PROJECT_SOURCE_DIR}/test/leak_suppr.txt") - + endforeach(test_src ${test_srcs}) ################################################################################## @@ -112,32 +114,32 @@ endforeach(test_src ${test_srcs}) foreach(perf_src ${perf_srcs} ) list(REMOVE_ITEM test_srcs ${perf_src}) - + #message("test_srcs = " ${test_srcs}) #message("perf_src = " ${perf_src}) - - get_filename_component(perf_bare_name ${perf_src} NAME) + + get_filename_component(perf_bare_name ${perf_src} NAME) string(REPLACE ".cpp" "" perf_bare_name_without_extension ${perf_bare_name}) string(REPLACE "\"" "" perf_name ${perf_bare_name_without_extension}) - + # create executable add_executable(${perf_name} EXCLUDE_FROM_ALL ${perf_src}) add_dependencies(check ${perf_name}) - + #message("Performance test: " ${perf_name}) - + # link libraries - target_link_libraries(${perf_name} peloton peloton-test-common) + target_link_libraries(${perf_name} peloton peloton-test-common) - # set target properties + # set target properties set_target_properties(${perf_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/test" COMMAND ${perf_name} - ) - + ) + # add test add_test(${perf_name} ${CMAKE_BINARY_DIR}/test/${perf_name} --gtest_color=yes --gtest_output=xml:${CMAKE_BINARY_DIR}/test/${perf_name}.xml) - + endforeach(perf_src ${perf_srcs}) diff --git a/test/brain/index_selection_test.cpp b/test/brain/index_selection_test.cpp new file mode 100644 index 00000000000..0c2450969fe --- /dev/null +++ b/test/brain/index_selection_test.cpp @@ -0,0 +1,653 @@ 
+//===----------------------------------------------------------------------===// +// +// Peloton +// +// index_selection_test.cpp +// +// Identification: test/brain/index_selection_test.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include + +#include "binder/bind_node_visitor.h" +#include "brain/index_selection.h" +#include "brain/what_if_index.h" +#include "catalog/index_catalog.h" +#include "common/harness.h" +#include "concurrency/transaction_manager_factory.h" +#include "optimizer/stats/column_stats.h" +#include "optimizer/stats/stats_storage.h" +#include "optimizer/stats/table_stats.h" +#include "sql/testing_sql_util.h" + +#include "brain/testing_index_selection_util.h" + +namespace peloton { +namespace test { + +using namespace index_selection; + +//===--------------------------------------------------------------------===// +// IndexSelectionTest +//===--------------------------------------------------------------------===// + +class IndexSelectionTest : public PelotonTest {}; + +/** + * @brief Verify if admissible index count is correct for a given + * query workload. 
+ */ +TEST_F(IndexSelectionTest, AdmissibleIndexesTest) { + // Parameters + std::string table_name = "table1"; + std::string database_name = DEFAULT_DB_NAME; + long num_tuples = 10; + + size_t max_index_cols = 2; + size_t enumeration_threshold = 2; + size_t num_indexes = 10; + + brain::IndexSelectionKnobs knobs = {max_index_cols, enumeration_threshold, + num_indexes}; + + TableSchema schema(table_name, {{"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}}); + TestingIndexSelectionUtil testing_util(database_name); + testing_util.CreateTable(schema); + testing_util.InsertIntoTable(schema, num_tuples); + + // Form the query strings + std::vector query_strs; + std::vector admissible_indexes; + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE a < 1 or b > 4 GROUP BY a"); + // 2 indexes will be choosen in GetAdmissibleIndexes - a, b + admissible_indexes.push_back(2); + query_strs.push_back("SELECT a, b, c FROM " + table_name + + " WHERE a < 1 or b > 4 ORDER BY a"); + admissible_indexes.push_back(2); + query_strs.push_back("DELETE FROM " + table_name + " WHERE a < 1 or b > 4"); + admissible_indexes.push_back(2); + query_strs.push_back("UPDATE " + table_name + + " SET a = 45 WHERE a < 1 or b > 4"); + + admissible_indexes.push_back(2); + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + // Create a new workload + brain::Workload workload(query_strs, database_name, txn); + EXPECT_GT(workload.Size(), 0); + + // Verify the admissible indexes. 
+ auto queries = workload.GetQueries(); + for (unsigned long i = 0; i < queries.size(); i++) { + brain::Workload w(queries[i], workload.GetDatabaseName()); + brain::IndexSelection is(w, knobs, txn); + + brain::IndexConfiguration ic; + is.GetAdmissibleIndexes(queries[i].first, ic); + LOG_TRACE("Admissible indexes %ld, %s", i, ic.ToString().c_str()); + auto indexes = ic.GetIndexes(); + EXPECT_EQ(ic.GetIndexCount(), admissible_indexes[i]); + } + txn_manager.CommitTransaction(txn); +} + +/** + * @brief Tests the first iteration of the candidate index generation + * algorithm i.e. generating single column candidate indexes per query. + */ +TEST_F(IndexSelectionTest, CandidateIndexGenerationTest) { + std::string database_name = DEFAULT_DB_NAME; + + // Config knobs + size_t max_index_cols = 1; + size_t enumeration_threshold = 2; + size_t num_indexes = 10; + int num_rows = 2000; + + brain::IndexSelectionKnobs knobs = {max_index_cols, enumeration_threshold, + num_indexes}; + + TestingIndexSelectionUtil testing_util(database_name); + auto config = + testing_util.GetQueryStringsWorkload(QueryStringsWorkloadType::A); + auto table_schemas = config.first; + auto query_strings = config.second; + + // Create all the required tables for this workloads. + for (auto table_schema : table_schemas) { + testing_util.CreateTable(table_schema); + } + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + brain::Workload workload(query_strings, database_name, txn); + EXPECT_EQ(workload.Size(), query_strings.size()); + + // Generate candidate configurations. + // The table doesn't have any tuples, so the admissible indexes won't help + // any of the queries --> candidate set should be 0. 
+ brain::IndexConfiguration candidate_config; + brain::IndexConfiguration admissible_config; + + brain::IndexSelection index_selection(workload, knobs, txn); + index_selection.GenerateCandidateIndexes(candidate_config, admissible_config, + workload); + + LOG_TRACE("Admissible Index Count: %ld", admissible_config.GetIndexCount()); + LOG_TRACE("Admissible Indexes: %s", admissible_config.ToString().c_str()); + LOG_TRACE("Candidate Indexes: %s", candidate_config.ToString().c_str()); + + EXPECT_EQ(admissible_config.GetIndexCount(), 2); + // TODO: There is no data in the table. Indexes should not help. Should return + // 0. But currently, the cost with index for a query if 0.0 if there are no + // rows in the table where as the cost without the index is 1.0. This needs to + // be fixed in the cost model. Or is this behaviour of optimizer fine? + // EXPECT_EQ(candidate_config.GetIndexCount(), 0); + EXPECT_EQ(candidate_config.GetIndexCount(), 2); + + // Insert tuples into the tables. + for (auto table_schema : table_schemas) { + testing_util.InsertIntoTable(table_schema, num_rows); + } + + candidate_config.Clear(); + admissible_config.Clear(); + + brain::IndexSelection is(workload, knobs, txn); + is.GenerateCandidateIndexes(candidate_config, admissible_config, workload); + + LOG_TRACE("Admissible Index Count: %ld", admissible_config.GetIndexCount()); + LOG_TRACE("Admissible Indexes: %s", admissible_config.ToString().c_str()); + LOG_TRACE("Candidate Indexes: %s", candidate_config.ToString().c_str()); + EXPECT_EQ(admissible_config.GetIndexCount(), 2); + // Indexes help reduce the cost of the queries, so they get selected. 
+ EXPECT_EQ(candidate_config.GetIndexCount(), 2); + + auto admissible_indexes = admissible_config.GetIndexes(); + auto candidate_indexes = candidate_config.GetIndexes(); + + // Columns - a and c + std::set expected_cols = {0, 2}; + + for (auto col : expected_cols) { + std::vector cols = {col}; + bool found = false; + for (auto index : admissible_indexes) { + found |= (index->column_oids == cols); + } + EXPECT_TRUE(found); + + found = false; + for (auto index : candidate_indexes) { + found |= (index->column_oids == cols); + } + EXPECT_TRUE(found); + } + + txn_manager.CommitTransaction(txn); +} + +/** + * @brief Tests multi column index generation from a set of candidate indexes. + */ +TEST_F(IndexSelectionTest, MultiColumnIndexGenerationTest) { + std::string database_name = DEFAULT_DB_NAME; + + brain::IndexConfiguration candidates; + brain::IndexConfiguration single_column_indexes; + brain::IndexConfiguration result; + brain::IndexConfiguration expected; + brain::Workload workload(database_name); + + size_t max_index_cols = 5; + size_t enumeration_threshold = 2; + size_t num_indexes = 10; + + brain::IndexSelectionKnobs knobs = {max_index_cols, enumeration_threshold, + num_indexes}; + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + brain::IndexSelection index_selection(workload, knobs, txn); + + std::vector cols; + + // Database: 1 + // Table: 1 + // Column: 1 + auto a11 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 1, 1)); + // Column: 2 + auto b11 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 1, 2)); + // Column: 3 + auto c11 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 1, 3)); + // Column: 1, 2 + cols = {1, 2}; + auto ab11 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 1, cols)); + // Column: 1, 3 + cols = {1, 3}; + auto ac11 = 
index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 1, cols)); + // Column: 2, 3 + cols = {2, 3}; + auto bc11 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 1, cols)); + // Column: 2, 1 + cols = {2, 1}; + auto ba11 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 1, cols)); + + // Database: 1 + // Table: 2 + // Column: 1 + auto a12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, 1)); + // Column: 2 + auto b12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, 2)); + // Column: 3 + auto c12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, 3)); + // Column: 2, 3 + cols = {2, 3}; + auto bc12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, cols)); + // Column: 1, 3 + cols = {1, 3}; + auto ac12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, cols)); + // Column: 3, 1 + cols = {3, 1}; + auto ca12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, cols)); + // Column: 3, 2 + cols = {3, 2}; + auto cb12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, cols)); + // Column: 1, 2, 3 + cols = {1, 2, 3}; + auto abc12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, cols)); + // Column: 2, 3, 1 + cols = {2, 3, 1}; + auto bca12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, cols)); + // Column: 1, 3, 2 + cols = {1, 3, 2}; + auto acb12 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(1, 2, cols)); + + // Database: 2 + // Table: 1 + // Column: 1 + auto a21 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(2, 1, 1)); + // Column: 2 + auto b21 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(2, 1, 2)); + // Column: 3 + auto c21 = 
index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(2, 1, 3)); + // Column: 1, 2 + cols = {1, 2}; + auto ab21 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(2, 1, cols)); + // Column: 1, 3 + cols = {1, 3}; + auto ac21 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(2, 1, cols)); + // Column: 1, 2, 3 + cols = {1, 2, 3}; + auto abc21 = index_selection.AddConfigurationToPool( + brain::HypotheticalIndexObject(2, 1, cols)); + + std::set> indexes; + + indexes = {a11, b11, c11, a12, b12, c12, a21, b21, c21}; + single_column_indexes = {indexes}; + + indexes = {a11, b11, bc12, ac12, c12, a21, abc21}; + candidates = {indexes}; + + index_selection.GenerateMultiColumnIndexes(candidates, single_column_indexes, + result); + + // candidates union (candidates * single_column_indexes) + indexes = {// candidates + a11, b11, bc12, ac12, c12, a21, abc21, + // crossproduct + ab11, ac11, ba11, bc11, bca12, acb12, ca12, cb12, ab21, ac21}; + expected = {indexes}; + + auto chosen_indexes = result.GetIndexes(); + auto expected_indexes = expected.GetIndexes(); + + for (auto index : chosen_indexes) { + int count = 0; + for (auto expected_index : expected_indexes) { + auto index_object = *(index.get()); + auto expected_index_object = *(expected_index.get()); + if (index_object == expected_index_object) count++; + } + EXPECT_EQ(1, count); + } + EXPECT_EQ(expected_indexes.size(), chosen_indexes.size()); + + txn_manager.CommitTransaction(txn); +} + +/** + * @brief end-to-end test which takes in a workload of queries + * and spits out the set of indexes that are the best ones for the + * workload. + */ +TEST_F(IndexSelectionTest, IndexSelectionTest1) { + std::string database_name = DEFAULT_DB_NAME; + + int num_rows = 2000; // number of rows to be inserted. 
+ + TestingIndexSelectionUtil testing_util(database_name); + auto config = + testing_util.GetQueryStringsWorkload(QueryStringsWorkloadType::B); + auto table_schemas = config.first; + auto query_strings = config.second; + + // Create and populate tables. + for (auto table_schema : table_schemas) { + testing_util.CreateTable(table_schema); + testing_util.InsertIntoTable(table_schema, num_rows); + } + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + brain::Workload workload(query_strings, database_name, txn); + EXPECT_EQ(workload.Size(), query_strings.size()); + + brain::IndexConfiguration best_config; + std::set> expected_indexes; + brain::IndexConfiguration expected_config; + + /** Test 1 + * Choose only 1 index with 1 column + * it should choose {B} + */ + size_t max_index_cols = 1; // multi-column index limit + size_t enumeration_threshold = 2; // naive enumeration threshold + size_t num_indexes = 1; // top num_indexes will be returned. 
+ + brain::IndexSelectionKnobs knobs = {max_index_cols, enumeration_threshold, + num_indexes}; + + brain::IndexSelection is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(1, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"b"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + /** Test 2 + * Choose 2 indexes with 1 column + * it should choose {A} and {B} + */ + max_index_cols = 1; + enumeration_threshold = 2; + num_indexes = 2; + knobs = {max_index_cols, enumeration_threshold, num_indexes}; + is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(2, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"a"}, &is), + testing_util.CreateHypotheticalIndex("dummy2", {"b"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + /** Test 3 + * Choose 1 index with up to 2 columns + * it should choose {BA} + */ + max_index_cols = 2; + enumeration_threshold = 2; + num_indexes = 1; + knobs = {max_index_cols, enumeration_threshold, num_indexes}; + is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(1, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"b", "a"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + /** Test 4 + * Choose 2 indexes with up to 2 columns + * it should choose {AB} and {BC} + */ 
+ max_index_cols = 2; + enumeration_threshold = 2; + num_indexes = 2; + knobs = {max_index_cols, enumeration_threshold, num_indexes}; + is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(2, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"a", "b"}, &is), + testing_util.CreateHypotheticalIndex("dummy2", {"b", "c"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + /** Test 5 + * Choose 4 indexes with up to 2 columns + * it should choose {AB}, {BC} from exhaustive and {AC} or {CA} from greedy + * more indexes donot give any added benefit + */ + max_index_cols = 2; + enumeration_threshold = 2; + num_indexes = 4; + knobs = {max_index_cols, enumeration_threshold, num_indexes}; + is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(3, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"a", "b"}, &is), + testing_util.CreateHypotheticalIndex("dummy2", {"a", "c"}, &is), + testing_util.CreateHypotheticalIndex("dummy2", {"b", "c"}, &is)}; + expected_config = {expected_indexes}; + + std::set> + alternate_expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"a", "b"}, &is), + testing_util.CreateHypotheticalIndex("dummy2", {"c", "a"}, &is), + testing_util.CreateHypotheticalIndex("dummy2", {"b", "c"}, &is)}; + brain::IndexConfiguration alternate_expected_config = { + alternate_expected_indexes}; + + // It can choose either AC or CA based on the distribution of C and A + EXPECT_TRUE((expected_config == best_config) || + (alternate_expected_config == best_config)); + + /** Test 6 + * Choose 
1 index with up to 3 columns + * it should choose {BA} + * more indexes / columns donot give any added benefit + */ + max_index_cols = 3; + enumeration_threshold = 2; + num_indexes = 1; + knobs = {max_index_cols, enumeration_threshold, num_indexes}; + is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(1, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"b", "a"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + /** Test 7 + * Choose 2 indexes with up to 2 columns + * it should choose {BA} and {AC} + * This has a naive threshold of 1, it chooses BA from exhaustive + * enumeration and AC greedily + */ + max_index_cols = 2; + enumeration_threshold = 1; + num_indexes = 2; + knobs = {max_index_cols, enumeration_threshold, num_indexes}; + is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(2, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy2", {"b", "a"}, &is), + testing_util.CreateHypotheticalIndex("dummy2", {"a", "c"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + txn_manager.CommitTransaction(txn); +} + +/** + * @brief end-to-end test which takes in a workload of queries + * and spits out the set of indexes that are the best ones for more + * complex workloads. + */ +TEST_F(IndexSelectionTest, IndexSelectionTest2) { + std::string database_name = DEFAULT_DB_NAME; + int num_rows = 2000; // number of rows to be inserted. 
+ + TestingIndexSelectionUtil testing_util(database_name); + auto config = + testing_util.GetQueryStringsWorkload(QueryStringsWorkloadType::C); + auto table_schemas = config.first; + auto query_strings = config.second; + + // Create and populate tables. + for (auto table_schema : table_schemas) { + testing_util.CreateTable(table_schema); + testing_util.InsertIntoTable(table_schema, num_rows); + } + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + brain::Workload workload(query_strings, database_name, txn); + EXPECT_EQ(workload.Size(), query_strings.size()); + + brain::IndexConfiguration best_config; + std::set> expected_indexes; + brain::IndexConfiguration expected_config; + + /** Test 1 + * Choose only 1 index with up to 3 column + * it should choose {BCA} + */ + size_t max_index_cols = 3; + size_t enumeration_threshold = 2; + size_t num_indexes = 1; + brain::IndexSelectionKnobs knobs = {max_index_cols, enumeration_threshold, + num_indexes}; + brain::IndexSelection is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(1, best_config.GetIndexCount()); + + expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy3", {"b", "c", "a"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + /** Test 2 + * Choose only 2 indexes with up to 3 column + * it should choose some permutation of {BCA} and {DEF} + */ + max_index_cols = 3; + enumeration_threshold = 2; + num_indexes = 2; + knobs = {max_index_cols, enumeration_threshold, num_indexes}; + is = {workload, knobs, txn}; + + is.GetBestIndexes(best_config); + + LOG_TRACE("Best Indexes: %s", best_config.ToString().c_str()); + LOG_TRACE("Best Index Count: %ld", best_config.GetIndexCount()); + + EXPECT_EQ(2, best_config.GetIndexCount()); + + 
expected_indexes = { + testing_util.CreateHypotheticalIndex("dummy3", {"b", "c", "a"}, &is), + testing_util.CreateHypotheticalIndex("dummy3", {"d", "e", "f"}, &is)}; + expected_config = {expected_indexes}; + + EXPECT_TRUE(expected_config == best_config); + + txn_manager.CommitTransaction(txn); +} + +} // namespace test +} // namespace peloton diff --git a/test/brain/testing_index_selection_util.cpp b/test/brain/testing_index_selection_util.cpp new file mode 100644 index 00000000000..4a2840a67b2 --- /dev/null +++ b/test/brain/testing_index_selection_util.cpp @@ -0,0 +1,335 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// testing_index_selection_util.cpp +// +// Identification: test/brain/testing_index_selection_util.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "brain/testing_index_selection_util.h" +#include "brain/what_if_index.h" +#include "common/harness.h" +#include "optimizer/stats/stats_storage.h" +#include "sql/testing_sql_util.h" +#include "planner/index_scan_plan.h" + +namespace peloton { + +namespace test { + +namespace index_selection { + +TestingIndexSelectionUtil::TestingIndexSelectionUtil(std::string db_name) + : database_name_(db_name) { + srand(time(NULL)); + CreateDatabase(); +} + +TestingIndexSelectionUtil::~TestingIndexSelectionUtil() { + for (auto it = tables_created_.begin(); it != tables_created_.end(); it++) { + DropTable(it->first); + } + DropDatabase(); +} + +std::pair, std::vector> +TestingIndexSelectionUtil::GetQueryStringsWorkload( + QueryStringsWorkloadType type) { + std::vector query_strs; + std::vector table_schemas; + std::string table_name; + // Procedure to add a new workload: + // 1. Create all the table schemas required for the workload queries. + // 2. Create all the required workload query strings. 
+ switch (type) { + case A: { + table_name = "dummy1"; + table_schemas.emplace_back( + table_name, + std::initializer_list>{ + {"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}}); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE a = 160 and a = 250"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE c = 190 and c = 250"); + query_strs.push_back("SELECT a, b, c FROM " + table_name + + " WHERE a = 190 and c = 250"); + break; + } + case B: { + table_name = "dummy2"; + table_schemas.emplace_back( + table_name, + std::initializer_list>{ + {"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}}); + query_strs.push_back("SELECT * FROM " + table_name + " WHERE a = 160"); + query_strs.push_back("SELECT * FROM " + table_name + " WHERE b = 190"); + query_strs.push_back("SELECT * FROM " + table_name + " WHERE b = 81"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE a = 190 and b = 250"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE a = 190 and b = 250"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE b = 190 and a = 250"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE b = 190 and c = 250"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE b = 190 and c = 250"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE a = 190 and c = 250"); + break; + } + case C: { + table_name = "dummy3"; + table_schemas.emplace_back( + table_name, + std::initializer_list>{ + {"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}, + {"e", TupleValueType::INTEGER}, + {"f", TupleValueType::INTEGER}, + {"g", TupleValueType::INTEGER}}); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE a = 160 and b = 199 and c = 
1009"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE b = 190 and a = 677 and c = 987"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE b = 81 and c = 123 and a = 122"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE b = 81 and c = 123 and d = 122"); + query_strs.push_back("SELECT * FROM " + table_name + " WHERE b = 81"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE b = 81 and c = 12"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE d = 81 and e = 123 and f = 122"); + query_strs.push_back("SELECT * FROM " + table_name + " WHERE d = 81"); + query_strs.push_back("SELECT * FROM " + table_name + + " WHERE d = 81 and e = 12"); + break; + } + case D: { + std::string table_name_1 = "d_student"; + table_schemas.emplace_back( + table_name_1, + std::initializer_list>{ + {"name", TupleValueType::STRING}, + {"gpa", TupleValueType::INTEGER}, + {"id", TupleValueType::INTEGER}, + {"cgpa", TupleValueType::INTEGER}}); + std::string table_name_2 = "d_college"; + table_schemas.emplace_back( + table_name_2, + std::initializer_list>{ + {"name", TupleValueType::STRING}, + {"city", TupleValueType::STRING}, + {"county", TupleValueType::STRING}, + {"state", TupleValueType::STRING}, + {"country", TupleValueType::STRING}, + {"enrolment", TupleValueType::INTEGER}}); + std::string table_name_3 = "d_course"; + table_schemas.emplace_back( + table_name_3, + std::initializer_list>{ + {"name", TupleValueType::STRING}, + {"id", TupleValueType::INTEGER}}); + query_strs.push_back("SELECT * FROM " + table_name_1 + + " WHERE name = 'vamshi' and id = 40"); + query_strs.push_back("SELECT * FROM " + table_name_1 + " WHERE id = 100"); + query_strs.push_back("SELECT * FROM " + table_name_1 + + " WHERE name = 'siva' and id = 50"); + query_strs.push_back("SELECT * FROM " + table_name_1 + + " WHERE name = 'priyatham' and id = 60"); + query_strs.push_back("SELECT * FROM " + table_name_1 + + " WHERE id = 69 and name = 
'vamshi'"); + query_strs.push_back("SELECT * FROM " + table_name_1 + " WHERE id = 4"); + query_strs.push_back("SELECT * FROM " + table_name_1 + " WHERE id = 10"); + query_strs.push_back("SELECT cgpa FROM " + table_name_1 + + " WHERE name = 'vam'"); + query_strs.push_back("SELECT name FROM " + table_name_1 + + " WHERE cgpa = 3"); + query_strs.push_back("SELECT name FROM " + table_name_1 + + " WHERE cgpa = 9 and gpa = 9"); + query_strs.push_back("SELECT * FROM " + table_name_1 + + " WHERE cgpa = 9 and gpa = 9 and name = 'vam'"); + query_strs.push_back("SELECT * FROM " + table_name_1 + + " WHERE gpa = 9 and name = 'vam' and cgpa = 9"); + query_strs.push_back("SELECT country FROM " + table_name_2 + + " WHERE name = 'cmu'"); + query_strs.push_back("UPDATE " + table_name_2 + + " set name = 'cmu' where country = 'usa'"); + query_strs.push_back("UPDATE " + table_name_2 + + " set name = 'berkeley' where country = 'usa'"); + query_strs.push_back("DELETE FROM " + table_name_1 + + " where name = 'vam'"); + query_strs.push_back("DELETE FROM " + table_name_2 + + " where name = 'vam'"); + query_strs.push_back("DELETE FROM " + table_name_1 + " where id = 1"); + query_strs.push_back( + "SELECT * FROM d_student s inner join d_college c on s.name = " + "c.name inner join d_course co on c.name = co.name"); + query_strs.push_back( + "SELECT * FROM d_student join d_college on d_student.name = " + "d_college.name"); + query_strs.push_back("SELECT * FROM " + table_name_1 + " t1 ," + + table_name_2 + " t2 where t1.name = 'vam'"); + break; + } + default: + PELOTON_ASSERT(false); + } + return std::make_pair(table_schemas, query_strs); +} + +// Creates a new table with the provided schema. +void TestingIndexSelectionUtil::CreateTable(TableSchema schema) { + // Create table. 
+ std::ostringstream s_stream; + s_stream << "CREATE TABLE " << schema.table_name << " ("; + for (auto i = 0UL; i < schema.cols.size(); i++) { + s_stream << schema.cols[i].first; + s_stream << " "; + switch (schema.cols[i].second) { + case FLOAT: + s_stream << "FLOAT"; + break; + case INTEGER: + s_stream << "INT"; + break; + case STRING: + s_stream << "VARCHAR(30)"; + break; + default: + PELOTON_ASSERT(false); + } + if (i < (schema.cols.size() - 1)) { + s_stream << ", "; + } + } + s_stream << ");"; + LOG_TRACE("Create table: %s", s_stream.str().c_str()); + TestingSQLUtil::ExecuteSQLQuery(s_stream.str()); +} + +// Inserts specified number of tuples into the table with random values. +void TestingIndexSelectionUtil::InsertIntoTable(TableSchema schema, + long num_tuples) { + // Insert tuples into table + for (int i = 0; i < num_tuples; i++) { + std::ostringstream oss; + oss << "INSERT INTO " << schema.table_name << " VALUES ("; + for (auto col = 0UL; col < schema.cols.size(); col++) { + auto type = schema.cols[col].second; + switch (type) { + case INTEGER: + oss << rand() % 1000; + break; + case FLOAT: + oss << (float)(rand() % 100); + break; + case STRING: + oss << "'str" << rand() % RAND_MAX << "'"; + break; + default: + PELOTON_ASSERT(false); + } + if (col < (schema.cols.size() - 1)) { + oss << ", "; + } + } + oss << ");"; + LOG_TRACE("Inserting: %s", oss.str().c_str()); + TestingSQLUtil::ExecuteSQLQuery(oss.str()); + } + GenerateTableStats(); +} + +void TestingIndexSelectionUtil::GenerateTableStats() { + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + optimizer::StatsStorage *stats_storage = + optimizer::StatsStorage::GetInstance(); + ResultType result = stats_storage->AnalyzeStatsForAllTables(txn); + PELOTON_ASSERT(result == ResultType::SUCCESS); + (void)result; + txn_manager.CommitTransaction(txn); +} + +// Factory method +// Returns a what-if index on the columns at the given +// offset 
of the table. +std::shared_ptr +TestingIndexSelectionUtil::CreateHypotheticalIndex( + std::string table_name, std::vector index_col_names, + brain::IndexSelection *is) { + // We need transaction to get table object. + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + // Get the existing table so that we can find its oid and the cols oids. + auto table_object = catalog::Catalog::GetInstance()->GetTableObject( + database_name_, "public", table_name, txn); + auto col_obj_pairs = table_object->GetColumnObjects(); + + std::vector col_ids; + auto database_oid = table_object->GetDatabaseOid(); + auto table_oid = table_object->GetTableOid(); + + // Find the column oids. + for (auto col_name : index_col_names) { + for (auto it = col_obj_pairs.begin(); it != col_obj_pairs.end(); it++) { + LOG_DEBUG("Table id: %d, Column id: %d, Offset: %d, Name: %s", + it->second->GetTableOid(), it->second->GetColumnId(), + it->second->GetColumnOffset(), + it->second->GetColumnName().c_str()); + if (col_name == it->second->GetColumnName()) { + col_ids.push_back(it->second->GetColumnId()); + } + } + } + PELOTON_ASSERT(col_ids.size() == index_col_names.size()); + + std::shared_ptr index_obj; + + if (is == nullptr) { + auto obj_ptr = + new brain::HypotheticalIndexObject(database_oid, table_oid, col_ids); + index_obj = std::shared_ptr(obj_ptr); + } else { + auto obj = brain::HypotheticalIndexObject(database_oid, table_oid, col_ids); + index_obj = is->AddConfigurationToPool(obj); + } + + txn_manager.CommitTransaction(txn); + return index_obj; +} + +void TestingIndexSelectionUtil::CreateDatabase() { + std::string create_db_str = "CREATE DATABASE " + database_name_ + ";"; + TestingSQLUtil::ExecuteSQLQuery(create_db_str); +} + +void TestingIndexSelectionUtil::DropDatabase() { + std::string create_str = "DROP DATABASE " + database_name_ + ";"; + TestingSQLUtil::ExecuteSQLQuery(create_str); +} + +void 
TestingIndexSelectionUtil::DropTable(std::string table_name) { + std::string create_str = "DROP TABLE " + table_name + ";"; + TestingSQLUtil::ExecuteSQLQuery(create_str); +} + +} // namespace index_selection +} // namespace test +} // namespace peloton diff --git a/test/brain/what_if_index_test.cpp b/test/brain/what_if_index_test.cpp new file mode 100644 index 00000000000..22d26aad9e9 --- /dev/null +++ b/test/brain/what_if_index_test.cpp @@ -0,0 +1,489 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// what_if_index_test.cpp +// +// Identification: test/brain/what_if_index_test.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include "brain/what_if_index.h" +#include "common/harness.h" +#include "optimizer/stats/stats_storage.h" +#include "sql/testing_sql_util.h" +#include "planner/index_scan_plan.h" + +#include "brain/testing_index_selection_util.h" + +namespace peloton { +namespace test { + +using namespace index_selection; + +//===--------------------------------------------------------------------===// +// WhatIfIndex Tests +//===--------------------------------------------------------------------===// +class WhatIfIndexTests : public PelotonTest { + public: + WhatIfIndexTests() {} +}; + +TEST_F(WhatIfIndexTests, SingleColTest) { + std::string db_name = DEFAULT_DB_NAME; + int num_rows = 100; + + TableSchema schema("table1", {{"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}}); + + TestingIndexSelectionUtil testing_util(db_name); + testing_util.CreateTable(schema); + testing_util.InsertIntoTable(schema, num_rows); + + // Form the query. 
+ std::string query("SELECT a from " + schema.table_name + + " WHERE b = 100 and c = 5;"); + LOG_TRACE("Query: %s", query.c_str()); + + brain::IndexConfiguration config; + + std::unique_ptr stmt_list( + parser::PostgresParser::ParseSQLString(query)); + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto parser = parser::PostgresParser::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + std::unique_ptr binder( + new binder::BindNodeVisitor(txn, DEFAULT_DB_NAME)); + + // Get the first statement. + auto sql_statement = std::shared_ptr( + stmt_list.get()->PassOutStatement(0)); + + binder->BindNameToNode(sql_statement.get()); + + // 1. Get the optimized plan tree without the indexes (sequential scan) + auto result = brain::WhatIfIndex::GetCostAndBestPlanTree( + sql_statement, config, DEFAULT_DB_NAME, txn); + auto cost_without_index = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::SEQSCAN); + LOG_TRACE("Cost of the query without indexes: %lf", cost_without_index); + EXPECT_NE(result->plan, nullptr); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + // 2. Get the optimized plan tree with 1 hypothetical indexes (indexes) + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"b"})); + + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_1 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with 1 index: %lf", cost_with_index_1); + EXPECT_NE(result->plan, nullptr); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + // 3. 
Get the optimized plan tree with 2 hypothetical indexes (indexes) + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"c"})); + + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_2 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with 2 indexes: %lf", cost_with_index_2); + + EXPECT_LT(cost_with_index_1, cost_without_index); + EXPECT_LT(cost_with_index_2, cost_without_index); + EXPECT_NE(result->plan, nullptr); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + txn_manager.CommitTransaction(txn); +} + +/** + * @brief This test checks if a hypothetical index on multiple columns + * helps a particular query. + */ +TEST_F(WhatIfIndexTests, MultiColumnTest1) { + std::string db_name = DEFAULT_DB_NAME; + int num_rows = 1000; + + TableSchema schema("table1", {{"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}}); + TestingIndexSelectionUtil testing_util(db_name); + testing_util.CreateTable(schema); + testing_util.InsertIntoTable(schema, num_rows); + + // Form the query + std::string query("SELECT a from " + schema.table_name + + " WHERE b = 200 and c = 100;"); + LOG_TRACE("Query: %s", query.c_str()); + + brain::IndexConfiguration config; + + std::unique_ptr stmt_list( + parser::PostgresParser::ParseSQLString(query)); + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto parser = parser::PostgresParser::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + std::unique_ptr binder( + new binder::BindNodeVisitor(txn, DEFAULT_DB_NAME)); + + // Get the first statement. 
+ auto sql_statement = std::shared_ptr( + stmt_list.get()->PassOutStatement(0)); + + binder->BindNameToNode(sql_statement.get()); + + // Get the optimized plan tree without the indexes (sequential scan) + auto result = brain::WhatIfIndex::GetCostAndBestPlanTree( + sql_statement, config, DEFAULT_DB_NAME, txn); + auto cost_without_index = result->cost; + LOG_TRACE("Cost of the query without indexes {}: %lf", cost_without_index); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + // Insert hypothetical catalog objects + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"a", "c"})); + + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_1 = result->cost; + LOG_TRACE("Cost of the query with index {'a', 'c'}: %lf", cost_with_index_1); + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::SEQSCAN); + EXPECT_DOUBLE_EQ(cost_without_index, cost_with_index_1); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + config.Clear(); + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"a", "b"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_2 = result->cost; + LOG_TRACE("Cost of the query with index {'a', 'b'}: %lf", cost_with_index_2); + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::SEQSCAN); + EXPECT_DOUBLE_EQ(cost_without_index, cost_with_index_2); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + config.Clear(); + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"b", "c"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_3 = result->cost; + LOG_TRACE("Cost of the query with index {'b', 'c'}: %lf", cost_with_index_3); + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + EXPECT_GT(cost_without_index, 
cost_with_index_3); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + config.Clear(); + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"b"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_4 = result->cost; + EXPECT_LE(cost_with_index_3, cost_with_index_4); + + // The cost of using one index {1} should be greater than the cost + // of using both the indexes {1, 2} for the query. + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'b'}: %lf", cost_with_index_4); + LOG_TRACE("%s", result->plan->GetInfo().c_str()); + + txn_manager.CommitTransaction(txn); +} + +TEST_F(WhatIfIndexTests, MultiColumnTest2) { + std::string db_name = DEFAULT_DB_NAME; + int num_rows = 1000; + + TableSchema schema("table1", {{"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}, + {"e", TupleValueType::INTEGER}, + {"f", TupleValueType::INTEGER}}); + TestingIndexSelectionUtil testing_util(db_name); + testing_util.CreateTable(schema); + testing_util.InsertIntoTable(schema, num_rows); + + // Form the query. + std::string query("SELECT a from " + schema.table_name + + " WHERE b = 500 AND e = 100;"); + LOG_TRACE("Query: %s", query.c_str()); + + brain::IndexConfiguration config; + + std::unique_ptr stmt_list( + parser::PostgresParser::ParseSQLString(query)); + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto parser = parser::PostgresParser::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + std::unique_ptr binder( + new binder::BindNodeVisitor(txn, DEFAULT_DB_NAME)); + + // Get the first statement. 
+ auto sql_statement = std::shared_ptr( + stmt_list.get()->PassOutStatement(0)); + + binder->BindNameToNode(sql_statement.get()); + + // Get the optimized plan tree without the indexes (sequential scan) + auto result = brain::WhatIfIndex::GetCostAndBestPlanTree( + sql_statement, config, DEFAULT_DB_NAME, txn); + auto cost_without_index = result->cost; + LOG_TRACE("Cost of the query without indexes: %lf", cost_without_index); + + // Insert hypothetical catalog objects + // Index on cols a, b, c, d, e. + config.AddIndexObject(testing_util.CreateHypotheticalIndex( + schema.table_name, {"a", "b", "c", "d", "e"})); + + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_1 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::SEQSCAN); + LOG_TRACE("Cost of the query with index {'a', 'b', 'c', 'd', 'e'}: %lf", + cost_with_index_1); + EXPECT_DOUBLE_EQ(cost_without_index, cost_with_index_1); + + config.Clear(); + config.AddIndexObject(testing_util.CreateHypotheticalIndex( + schema.table_name, {"a", "c", "d", "f"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_2 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::SEQSCAN); + LOG_TRACE("Cost of the query with index {'a', 'c', 'd', 'f'}: %lf", + cost_with_index_2); + EXPECT_DOUBLE_EQ(cost_without_index, cost_with_index_2); + + config.Clear(); + config.AddIndexObject(testing_util.CreateHypotheticalIndex( + schema.table_name, {"a", "b", "d", "e"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_3 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::SEQSCAN); + LOG_TRACE("Cost of the query with index {'a', 'b', 'd', 'e'}: %lf", + cost_with_index_3); + EXPECT_DOUBLE_EQ(cost_without_index, cost_with_index_3); + + config.Clear(); + 
config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"b", "c", "e"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_4 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'b', 'c', 'e'}: %lf", + cost_with_index_4); + EXPECT_GT(cost_without_index, cost_with_index_4); + + config.Clear(); + config.AddIndexObject(testing_util.CreateHypotheticalIndex( + schema.table_name, {"b", "c", "d", "e"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_5 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'b', 'c', 'd', 'e'}: %lf", + cost_with_index_5); + EXPECT_GT(cost_without_index, cost_with_index_5); + + config.Clear(); + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"b", "e"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_6 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'b', 'e'}: %lf", cost_with_index_6); + EXPECT_GT(cost_without_index, cost_with_index_6); + EXPECT_GT(cost_with_index_5, cost_with_index_6); + EXPECT_GT(cost_with_index_4, cost_with_index_6); + + config.Clear(); + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"e"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_7 = result->cost; + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'e'} : %lf", cost_with_index_7); + EXPECT_GT(cost_without_index, cost_with_index_7); + EXPECT_GT(cost_with_index_7, 
cost_with_index_6); + + config.Clear(); + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"b"})); + result = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_8 = result->cost; + LOG_TRACE("Cost of the query with index {'b'}: %lf", cost_with_index_8); + EXPECT_EQ(result->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + EXPECT_GT(cost_without_index, cost_with_index_8); + EXPECT_GT(cost_with_index_8, cost_with_index_6); + + txn_manager.CommitTransaction(txn); +} + +/** + * @brief This test checks if a hypothetical index on multiple columns + * helps a particular query. + */ +TEST_F(WhatIfIndexTests, MultiColumnTest3) { + std::string db_name = DEFAULT_DB_NAME; + int num_rows = 1000; + + TableSchema schema("table1", {{"a", TupleValueType::INTEGER}, + {"b", TupleValueType::INTEGER}, + {"c", TupleValueType::INTEGER}, + {"d", TupleValueType::INTEGER}}); + TestingIndexSelectionUtil testing_util(db_name); + testing_util.CreateTable(schema); + testing_util.InsertIntoTable(schema, num_rows); + + // Form the query + std::string query1("SELECT a from " + schema.table_name + + " WHERE a = 50 and b = 200 and c = 100 and d = 50;"); + std::string query2("SELECT a from " + schema.table_name + + " WHERE c = 100 and a = 50 and d = 1 and b = 123;"); + std::string query3("SELECT a from " + schema.table_name + + " WHERE d = 100 and c = 50 and b = 1 and a = 13;"); + LOG_TRACE("Query1: %s", query1.c_str()); + LOG_TRACE("Query2: %s", query2.c_str()); + LOG_TRACE("Query3: %s", query3.c_str()); + + brain::IndexConfiguration config; + + std::unique_ptr stmt_list1( + parser::PostgresParser::ParseSQLString(query1)); + std::unique_ptr stmt_list2( + parser::PostgresParser::ParseSQLString(query2)); + std::unique_ptr stmt_list3( + parser::PostgresParser::ParseSQLString(query3)); + + auto &txn_manager = concurrency::TransactionManagerFactory::GetInstance(); + auto parser = 
parser::PostgresParser::GetInstance(); + auto txn = txn_manager.BeginTransaction(); + + std::unique_ptr binder( + new binder::BindNodeVisitor(txn, DEFAULT_DB_NAME)); + + // Get the first statement. + auto sql_statement1 = std::shared_ptr( + stmt_list1.get()->PassOutStatement(0)); + auto sql_statement2 = std::shared_ptr( + stmt_list2.get()->PassOutStatement(0)); + auto sql_statement3 = std::shared_ptr( + stmt_list3.get()->PassOutStatement(0)); + + binder->BindNameToNode(sql_statement1.get()); + binder->BindNameToNode(sql_statement2.get()); + binder->BindNameToNode(sql_statement3.get()); + + // Get the optimized plan tree without the indexes (sequential scan) + auto result1 = brain::WhatIfIndex::GetCostAndBestPlanTree( + sql_statement1, config, DEFAULT_DB_NAME, txn); + auto cost_without_index = result1->cost; + LOG_TRACE("Cost of the query without indexes {}: %lf", cost_without_index); + LOG_TRACE("%s", result1->plan->GetInfo().c_str()); + EXPECT_EQ(result1->plan->GetPlanNodeType(), PlanNodeType::SEQSCAN); + + // Insert hypothetical catalog objects + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"a"})); + + result1 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement1, config, + DEFAULT_DB_NAME, txn); + auto result2 = brain::WhatIfIndex::GetCostAndBestPlanTree( + sql_statement2, config, DEFAULT_DB_NAME, txn); + auto result3 = brain::WhatIfIndex::GetCostAndBestPlanTree( + sql_statement3, config, DEFAULT_DB_NAME, txn); + auto cost_with_index_1_1 = result1->cost; + auto cost_with_index_1_2 = result2->cost; + auto cost_with_index_1_3 = result3->cost; + LOG_TRACE("Cost of the query with index {'a'}: %lf", cost_with_index_1_1); + EXPECT_EQ(result1->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + EXPECT_GT(cost_without_index, cost_with_index_1_1); + EXPECT_DOUBLE_EQ(cost_with_index_1_1, cost_with_index_1_2); + EXPECT_DOUBLE_EQ(cost_with_index_1_2, cost_with_index_1_3); + + config.Clear(); + config.AddIndexObject( + 
testing_util.CreateHypotheticalIndex(schema.table_name, {"a", "b"})); + result1 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement1, config, + DEFAULT_DB_NAME, txn); + result2 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement2, config, + DEFAULT_DB_NAME, txn); + result3 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement3, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_2_1 = result1->cost; + auto cost_with_index_2_2 = result2->cost; + auto cost_with_index_2_3 = result3->cost; + EXPECT_EQ(result1->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'a', 'b'}: %lf", + cost_with_index_2_1); + EXPECT_GT(cost_without_index, cost_with_index_2_1); + EXPECT_GT(cost_with_index_1_1, cost_with_index_2_1); + EXPECT_DOUBLE_EQ(cost_with_index_2_1, cost_with_index_2_2); + EXPECT_DOUBLE_EQ(cost_with_index_2_2, cost_with_index_2_3); + + config.Clear(); + config.AddIndexObject( + testing_util.CreateHypotheticalIndex(schema.table_name, {"a", "b", "c"})); + result1 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement1, config, + DEFAULT_DB_NAME, txn); + result2 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement2, config, + DEFAULT_DB_NAME, txn); + result3 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement3, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_3_1 = result1->cost; + auto cost_with_index_3_2 = result2->cost; + auto cost_with_index_3_3 = result3->cost; + EXPECT_EQ(result1->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'a', 'b', 'c'}: %lf", + cost_with_index_3_1); + EXPECT_GT(cost_without_index, cost_with_index_3_1); + EXPECT_GT(cost_with_index_2_1, cost_with_index_3_1); + EXPECT_DOUBLE_EQ(cost_with_index_3_1, cost_with_index_3_2); + EXPECT_DOUBLE_EQ(cost_with_index_3_2, cost_with_index_3_3); + + config.Clear(); + config.AddIndexObject(testing_util.CreateHypotheticalIndex( + schema.table_name, {"a", "b", "c", "d"})); + result1 
= brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement1, config, + DEFAULT_DB_NAME, txn); + result2 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement2, config, + DEFAULT_DB_NAME, txn); + result3 = brain::WhatIfIndex::GetCostAndBestPlanTree(sql_statement3, config, + DEFAULT_DB_NAME, txn); + auto cost_with_index_4_1 = result1->cost; + auto cost_with_index_4_2 = result2->cost; + auto cost_with_index_4_3 = result3->cost; + EXPECT_EQ(result1->plan->GetPlanNodeType(), PlanNodeType::INDEXSCAN); + LOG_TRACE("Cost of the query with index {'a', 'b', 'c', 'd'}: %lf", + cost_with_index_4_1); + EXPECT_GT(cost_without_index, cost_with_index_4_1); + EXPECT_GT(cost_with_index_3_1, cost_with_index_4_1); + EXPECT_DOUBLE_EQ(cost_with_index_4_1, cost_with_index_4_2); + EXPECT_DOUBLE_EQ(cost_with_index_4_2, cost_with_index_4_3); + + txn_manager.CommitTransaction(txn); +} + +} // namespace test +} // namespace peloton diff --git a/test/include/brain/testing_index_selection_util.h b/test/include/brain/testing_index_selection_util.h new file mode 100644 index 00000000000..f3dcbcad9d2 --- /dev/null +++ b/test/include/brain/testing_index_selection_util.h @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Peloton +// +// testing_index_selection_util.h +// +// Identification: test/include/brain/testing_index_selection_util.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "brain/index_selection_util.h" +#include "brain/index_selection.h" + +namespace peloton { +namespace test { + +namespace index_selection { + +/** + * Table column type. + */ +enum TupleValueType { INTEGER, FLOAT, STRING }; + +/** + * Represents workload types used in the test cases. 
+ */ +enum QueryStringsWorkloadType { A = 1, B = 2, C = 3, D = 4 }; + +/** + * Represents the schema for creating tables in the test cases. + */ +class TableSchema { + public: + std::vector> cols; + std::unordered_map col_offset_map; + std::string table_name; + + TableSchema(){}; + TableSchema(std::string table_name, + std::vector> columns) { + auto i = 0UL; + for (auto col : columns) { + cols.push_back(col); + col_offset_map[col.first] = i; + i++; + } + this->table_name = table_name; + } +}; + +/** + * Utility class for testing Index Selection (auto-index). + */ +class TestingIndexSelectionUtil { + public: + /** + * Creates a database. + * @param db_name + */ + TestingIndexSelectionUtil(std::string db_name); + + /** + * Drops all tables and the database. + */ + ~TestingIndexSelectionUtil(); + + /** + * Inserts specified number of tuples. + * @param schema schema of the table to be created + * @param num_tuples number of tuples to be inserted with random values. + */ + void InsertIntoTable(TableSchema schema, long num_tuples); + + /** + * Create a new table.s + * @param schema + */ + void CreateTable(TableSchema schema); + + /** + * Factory method to create a hypothetical index object. The returned object + * can be used in the catalog or catalog cache. + * @param table_name + * @param index_col_names + * @return + */ + std::shared_ptr CreateHypotheticalIndex( + std::string table_name, std::vector cols, + brain::IndexSelection *is = nullptr); + + /** + * Return a micro workload + * This function returns queries and the respective table schemas + * User of this function must create all of the returned tables. 
+ * @param workload_type type of the workload to be returned + * @return workload query strings along with the table schema + */ + std::pair, std::vector> + GetQueryStringsWorkload(QueryStringsWorkloadType workload_type); + + private: + std::string database_name_; + std::unordered_map tables_created_; + + /** + * Create the database + */ + void CreateDatabase(); + + /** + * Drop the database + */ + void DropDatabase(); + + /** + * Drop the table + */ + void DropTable(std::string table_name); + + /** + * Generate stats for all the tables in the system. + */ + void GenerateTableStats(); +}; +} + +} // namespace test +} // namespace peloton diff --git a/test/sql/testing_sql_util.cpp b/test/sql/testing_sql_util.cpp index 220fa558686..c84484cb24f 100644 --- a/test/sql/testing_sql_util.cpp +++ b/test/sql/testing_sql_util.cpp @@ -120,7 +120,7 @@ ResultType TestingSQLUtil::ExecuteSQLQueryWithOptimizer( auto result_format = std::vector(tuple_descriptor.size(), 0); try { - LOG_DEBUG("\n%s", planner::PlanUtil::GetInfo(plan.get()).c_str()); + LOG_TRACE("\n%s", planner::PlanUtil::GetInfo(plan.get()).c_str()); // SetTrafficCopCounter(); counter_.store(1); auto status =