diff --git a/ydb/core/kqp/compute_actor/kqp_compute_actor_factory.cpp b/ydb/core/kqp/compute_actor/kqp_compute_actor_factory.cpp index 59b2ef6fe181..76fe3c6d0ed3 100644 --- a/ydb/core/kqp/compute_actor/kqp_compute_actor_factory.cpp +++ b/ydb/core/kqp/compute_actor/kqp_compute_actor_factory.cpp @@ -59,8 +59,7 @@ struct TMemoryQuotaManager : public NYql::NDq::TGuaranteeQuotaManager { } TString MemoryConsumptionDetails() const override { - // NOTE: don't forget to disable verbosity in stable branches. - return Tx->ToString(true); + return Tx->ToString(); } void TerminateHandler(bool success, const NYql::TIssues& issues) { diff --git a/ydb/core/kqp/executer_actor/kqp_data_executer.cpp b/ydb/core/kqp/executer_actor/kqp_data_executer.cpp index cbd154485695..9d9d7340fd46 100644 --- a/ydb/core/kqp/executer_actor/kqp_data_executer.cpp +++ b/ydb/core/kqp/executer_actor/kqp_data_executer.cpp @@ -145,6 +145,7 @@ class TKqpDataExecuter : public TKqpExecuterBasePlanExecution(); @@ -3045,6 +3047,7 @@ class TKqpDataExecuter : public TKqpExecuterBase ArrayBufferMinFillPercentage; + const bool VerboseMemoryLimitException; }; } // namespace diff --git a/ydb/core/kqp/executer_actor/kqp_planner.cpp b/ydb/core/kqp/executer_actor/kqp_planner.cpp index a126f30eeae9..ad6fed383ec8 100644 --- a/ydb/core/kqp/executer_actor/kqp_planner.cpp +++ b/ydb/core/kqp/executer_actor/kqp_planner.cpp @@ -112,6 +112,7 @@ TKqpPlanner::TKqpPlanner(TKqpPlanner::TArgs&& args) , CaFactory_(args.CaFactory_) , BlockTrackingMode(args.BlockTrackingMode) , ArrayBufferMinFillPercentage(args.ArrayBufferMinFillPercentage) + , VerboseMemoryLimitException(args.VerboseMemoryLimitException) { if (GUCSettings) { SerializedGUCSettings = GUCSettings->SerializeToString(); @@ -479,7 +480,7 @@ TString TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize) TxInfo = MakeIntrusive( TxId, TInstant::Now(), ResourceManager_->GetCounters(), - UserRequestContext->PoolId, memoryPoolPercent, Database); + UserRequestContext->PoolId, memoryPoolPercent, Database, VerboseMemoryLimitException); } if (ArrayBufferMinFillPercentage) { diff --git a/ydb/core/kqp/executer_actor/kqp_planner.h b/ydb/core/kqp/executer_actor/kqp_planner.h index 9297002b7456..39db0c6ff10a 100644 --- a/ydb/core/kqp/executer_actor/kqp_planner.h +++ b/ydb/core/kqp/executer_actor/kqp_planner.h @@ -71,6 +71,7 @@ class TKqpPlanner { const std::shared_ptr& CaFactory_; const NKikimrConfig::TTableServiceConfig::EBlockTrackingMode BlockTrackingMode; const TMaybe ArrayBufferMinFillPercentage; + const bool VerboseMemoryLimitException; }; TKqpPlanner(TKqpPlanner::TArgs&& args); @@ -150,6 +151,7 @@ class TKqpPlanner { TVector LastStats; const NKikimrConfig::TTableServiceConfig::EBlockTrackingMode BlockTrackingMode; const TMaybe ArrayBufferMinFillPercentage; + const bool VerboseMemoryLimitException; public: static bool UseMockEmptyPlanner; // for tests: if true then use TKqpMockEmptyPlanner that leads to the error diff --git a/ydb/core/kqp/executer_actor/kqp_scan_executer.cpp b/ydb/core/kqp/executer_actor/kqp_scan_executer.cpp index 1151910b9460..7cb5e3824a33 100644 --- a/ydb/core/kqp/executer_actor/kqp_scan_executer.cpp +++ b/ydb/core/kqp/executer_actor/kqp_scan_executer.cpp @@ -314,8 +314,9 @@ class TKqpScanExecuter : public TKqpExecuterBase { TIntrusivePtr txInfo = MakeIntrusive( txId, TInstant::Now(), ResourceManager_->GetCounters(), msg.GetSchedulerGroup(), msg.GetMemoryPoolPercent(), - msg.GetDatabase()); + msg.GetDatabase(), Config.GetVerboseMemoryLimitException()); const ui32 tasksCount = msg.GetTasks().size(); for (auto& dqTask: *msg.MutableTasks()) { diff --git a/ydb/core/kqp/rm_service/kqp_rm_service.h b/ydb/core/kqp/rm_service/kqp_rm_service.h index bba6426b8aad..e02f116c5dee 100644 --- a/ydb/core/kqp/rm_service/kqp_rm_service.h +++ b/ydb/core/kqp/rm_service/kqp_rm_service.h @@ -118,39 +118,48 @@ class TTxState : public TAtomicRefCount { const TString PoolId; const double MemoryPoolPercent; const TString Database; + const bool CollectBacktrace; private: std::atomic TxScanQueryMemory = 0; std::atomic TxExternalDataQueryMemory = 0; std::atomic TxExecutionUnits = 0; - std::atomic TxMaxAllocation = 0; - std::atomic TxFailedAllocation = 0; + std::atomic TxMaxAllocationSize = 0; // TODO(ilezhankin): it's better to use std::atomic> which is not supported at the moment. std::atomic TxMaxAllocationBacktrace = nullptr; - std::atomic TxFailedAllocationBacktrace = nullptr; + + // NOTE: it's hard to maintain atomic pointer in case of tracking the last failed allocation backtrace, + // because while we try to print one - the new last may emerge and delete previous. + mutable std::mutex BacktraceMutex; + std::atomic TxFailedAllocationSize = 0; // protected by BacktraceMutex (only if CollectBacktrace == true) + TBackTrace TxFailedAllocationBacktrace; // protected by BacktraceMutex + std::atomic HasFailedAllocationBacktrace = false; public: explicit TTxState(ui64 txId, TInstant now, TIntrusivePtr counters, const TString& poolId, const double memoryPoolPercent, - const TString& database) + const TString& database, bool collectBacktrace) : TxId(txId) , CreatedAt(now) , Counters(std::move(counters)) , PoolId(poolId) , MemoryPoolPercent(memoryPoolPercent) , Database(database) + , CollectBacktrace(collectBacktrace) {} ~TTxState() { delete TxMaxAllocationBacktrace.load(); - delete TxFailedAllocationBacktrace.load(); } std::pair MakePoolId() const { return std::make_pair(Database, PoolId); } - TString ToString(bool verbose = false) const { + TString ToString() const { + // use unique_lock to safely unlock mutex in case of exceptions + std::unique_lock backtraceLock(BacktraceMutex, std::defer_lock); + auto res = TStringBuilder() << "TxResourcesInfo { " << "TxId: " << TxId << ", Database: " << Database; @@ -160,19 +169,28 @@ class TTxState : public TAtomicRefCount { << ", MemoryPoolPercent: " << Sprintf("%.2f", MemoryPoolPercent > 0 ? MemoryPoolPercent : 100); } + if (CollectBacktrace) { + backtraceLock.lock(); + } + res << ", tx initially granted memory: " << HumanReadableSize(TxExternalDataQueryMemory.load(), SF_BYTES) << ", tx total memory allocations: " << HumanReadableSize(TxScanQueryMemory.load(), SF_BYTES) - << ", tx largest successful memory allocation: " << HumanReadableSize(TxMaxAllocation.load(), SF_BYTES) - << ", tx last failed memory allocation: " << HumanReadableSize(TxFailedAllocation.load(), SF_BYTES) + << ", tx largest successful memory allocation: " << HumanReadableSize(TxMaxAllocationSize.load(), SF_BYTES) + << ", tx last failed memory allocation: " << HumanReadableSize(TxFailedAllocationSize.load(), SF_BYTES) << ", tx total execution units: " << TxExecutionUnits.load() << ", started at: " << CreatedAt << " }" << Endl; - if (verbose && TxMaxAllocationBacktrace.load()) { - res << "TxMaxAllocationBacktrace:" << Endl << TxMaxAllocationBacktrace.load()->PrintToString(); + if (CollectBacktrace && HasFailedAllocationBacktrace.load()) { + res << "TxFailedAllocationBacktrace:" << Endl << TxFailedAllocationBacktrace.PrintToString(); + } + + if (CollectBacktrace) { + backtraceLock.unlock(); } - if (verbose && TxFailedAllocationBacktrace.load()) { - res << "TxFailedAllocationBacktrace:" << Endl << TxFailedAllocationBacktrace.load()->PrintToString(); + + if (CollectBacktrace && TxMaxAllocationBacktrace.load()) { + res << "TxMaxAllocationBacktrace:" << Endl << TxMaxAllocationBacktrace.load()->PrintToString(); } return res; @@ -183,22 +201,19 @@ class TTxState : public TAtomicRefCount { } void AckFailedMemoryAlloc(ui64 memory) { - auto* oldBacktrace = TxFailedAllocationBacktrace.load(); - ui64 lastAlloc = TxFailedAllocation.load(); - bool exchanged = false; + // use unique_lock to safely unlock mutex in case of exceptions + std::unique_lock backtraceLock(BacktraceMutex, std::defer_lock); - while(!exchanged) { - exchanged = TxFailedAllocation.compare_exchange_weak(lastAlloc, memory); + if (CollectBacktrace) { + backtraceLock.lock(); } - if (exchanged) { - auto* newBacktrace = new TBackTrace(); - newBacktrace->Capture(); - if (TxFailedAllocationBacktrace.compare_exchange_strong(oldBacktrace, newBacktrace)) { - delete oldBacktrace; - } else { - delete newBacktrace; - } + TxFailedAllocationSize = memory; + + if (CollectBacktrace) { + TxFailedAllocationBacktrace.Capture(); + HasFailedAllocationBacktrace = true; + backtraceLock.unlock(); } } @@ -239,17 +254,18 @@ class TTxState : public TAtomicRefCount { } auto* oldBacktrace = TxMaxAllocationBacktrace.load(); - ui64 maxAlloc = TxMaxAllocation.load(); + ui64 maxAlloc = TxMaxAllocationSize.load(); bool exchanged = false; while(maxAlloc < resources.Memory && !exchanged) { - exchanged = TxMaxAllocation.compare_exchange_weak(maxAlloc, resources.Memory); + exchanged = TxMaxAllocationSize.compare_exchange_weak(maxAlloc, resources.Memory); } if (exchanged) { auto* newBacktrace = new TBackTrace(); newBacktrace->Capture(); if (TxMaxAllocationBacktrace.compare_exchange_strong(oldBacktrace, newBacktrace)) { + // XXX(ilezhankin): technically it's possible to have a race with `ToString()`, but it's very unlikely. delete oldBacktrace; } else { delete newBacktrace; diff --git a/ydb/core/kqp/rm_service/kqp_rm_ut.cpp b/ydb/core/kqp/rm_service/kqp_rm_ut.cpp index 07b6c28336eb..b7875a7b39bd 100644 --- a/ydb/core/kqp/rm_service/kqp_rm_ut.cpp +++ b/ydb/core/kqp/rm_service/kqp_rm_ut.cpp @@ -184,7 +184,7 @@ class KqpRm : public TTestBase { } TIntrusivePtr MakeTx(ui64 txId, std::shared_ptr rm) { - return MakeIntrusive(txId, TInstant::Now(), rm->GetCounters(), "", (double)100, ""); + return MakeIntrusive(txId, TInstant::Now(), rm->GetCounters(), "", (double)100, "", false); } TIntrusivePtr MakeTask(ui64 taskId, TIntrusivePtr tx) { diff --git a/ydb/core/protos/table_service_config.proto b/ydb/core/protos/table_service_config.proto index b3db9d28b608..890b9ecd3816 100644 --- a/ydb/core/protos/table_service_config.proto +++ b/ydb/core/protos/table_service_config.proto @@ -51,6 +51,8 @@ message TTableServiceConfig { optional uint64 MaxNonParallelTopStageExecutionLimit = 26 [default = 1]; optional bool PreferLocalDatacenterExecution = 27 [ default = true ]; optional uint64 MaxNonParallelDataQueryTasksLimit = 28 [default = 1000]; + + optional bool VerboseMemoryLimitException = 29 [default = false]; } message TSpillingServiceConfig {