diff --git a/cloud/blockstore/libs/storage/partition_nonrepl/part_nonrepl_migration_ut.cpp b/cloud/blockstore/libs/storage/partition_nonrepl/part_nonrepl_migration_ut.cpp index 68019d98b3..133e8f6555 100644 --- a/cloud/blockstore/libs/storage/partition_nonrepl/part_nonrepl_migration_ut.cpp +++ b/cloud/blockstore/libs/storage/partition_nonrepl/part_nonrepl_migration_ut.cpp @@ -207,7 +207,8 @@ struct TTestEnv THashSet(), // laggingDeviceIds TDuration::Zero(), // maxTimedOutDeviceStateDuration false, // maxTimedOutDeviceStateDurationOverridden - false); + false // useSimpleMigrationBandwidthLimiter + ); auto part = std::make_unique( std::move(config), diff --git a/cloud/blockstore/libs/storage/protos/disk.proto b/cloud/blockstore/libs/storage/protos/disk.proto index ed54222fdb..5740506446 100644 --- a/cloud/blockstore/libs/storage/protos/disk.proto +++ b/cloud/blockstore/libs/storage/protos/disk.proto @@ -279,7 +279,7 @@ message TDiskConfig // A log of important events in the life of this disk. repeated TDiskHistoryItem History = 17; - // A list of devices that are lagging begind on writes. + // A list of devices that are lagging behind on writes. repeated TLaggingDevice LaggingDevices = 18; } @@ -436,13 +436,16 @@ message TLaggingAgent // Agent id. string AgentId = 1; + // Node that agent is running on. + uint32 NodeId = 2; + // Index of the mirror disk replica. // 0 - main devices // 1,2 - replica devices - uint32 ReplicaIndex = 2; + uint32 ReplicaIndex = 3; // A list of devices that belong to the agent. - repeated TLaggingDevice Devices = 3; + repeated TLaggingDevice Devices = 4; } //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/blockstore/libs/storage/volume/model/helpers.cpp b/cloud/blockstore/libs/storage/volume/model/helpers.cpp index e457eee6ee..d936f10644 100644 --- a/cloud/blockstore/libs/storage/volume/model/helpers.cpp +++ b/cloud/blockstore/libs/storage/volume/model/helpers.cpp @@ -81,16 +81,16 @@ const NProto::TDeviceConfig* FindDeviceConfig( return device.GetDeviceUUID() == deviceUUID; }; - const NProto::TDeviceConfig* timeoutedDeviceConfig = nullptr; - timeoutedDeviceConfig = FindIfPtr(meta.GetDevices(), deviceMatcher); - if (timeoutedDeviceConfig) { - return timeoutedDeviceConfig; + const NProto::TDeviceConfig* deviceConfig = nullptr; + deviceConfig = FindIfPtr(meta.GetDevices(), deviceMatcher); + if (deviceConfig) { + return deviceConfig; } for (const auto& replica: meta.GetReplicas()) { - timeoutedDeviceConfig = FindIfPtr(replica.GetDevices(), deviceMatcher); - if (timeoutedDeviceConfig) { - return timeoutedDeviceConfig; + deviceConfig = FindIfPtr(replica.GetDevices(), deviceMatcher); + if (deviceConfig) { + return deviceConfig; } } @@ -106,8 +106,9 @@ const NProto::TDeviceConfig* FindDeviceConfig( std::optional GetAgentDevicesIndexes( const NProto::TVolumeMeta& meta, ui32 agentNodeId, - TVector& laggingDevices) + TVector* laggingDevices) { + Y_DEBUG_ABORT_UNLESS(laggingDevices); std::optional replicaIndex; const RepeatedPtrField* replicaDevices = nullptr; const auto deviceMatcher = [agentNodeId](const NProto::TDeviceConfig& device) @@ -159,7 +160,7 @@ std::optional GetAgentDevicesIndexes( NProto::TLaggingDevice laggingDevice; laggingDevice.SetRowIndex(i); laggingDevice.SetDeviceUUID(device.GetDeviceUUID()); - laggingDevices.push_back(std::move(laggingDevice)); + laggingDevices->push_back(std::move(laggingDevice)); } } @@ -182,7 +183,7 @@ std::optional GetAgentDevicesIndexes( } laggingDevice.SetRowIndex(*sourceDeviceIndex); laggingDevice.SetDeviceUUID(targetDevice.GetDeviceUUID()); - laggingDevices.push_back(std::move(laggingDevice)); + laggingDevices->push_back(std::move(laggingDevice)); } } @@ -191,12 +192,12 @@ std::optional GetAgentDevicesIndexes( TSet ReplicaIndexesWithFreshDevices( const NProto::TVolumeMeta& meta, - NProto::TLaggingDevice device) + ui32 rowIndex) { TSet result; auto it = Find( meta.GetFreshDeviceIds(), - meta.GetDevices()[device.GetRowIndex()].GetDeviceUUID()); + meta.GetDevices()[rowIndex].GetDeviceUUID()); if (it != meta.GetFreshDeviceIds().end()) { result.insert(0); } @@ -205,7 +206,7 @@ TSet ReplicaIndexesWithFreshDevices( for (int i = 0; i < replicas.size(); i++) { auto it = Find( meta.GetFreshDeviceIds(), - replicas[i].GetDevices()[device.GetRowIndex()].GetDeviceUUID()); + replicas[i].GetDevices()[rowIndex].GetDeviceUUID()); if (it != meta.GetFreshDeviceIds().end()) { result.insert(i + 1); } @@ -215,7 +216,7 @@ TSet ReplicaIndexesWithFreshDevices( void RemoveLaggingDevicesFromMeta( NProto::TVolumeMeta& meta, - const TVector laggingDeviceIds) + const TVector& laggingDeviceIds) { for (auto& agent: *meta.MutableLaggingAgentsInfo()->MutableAgents()) { EraseIf( @@ -236,4 +237,34 @@ void RemoveLaggingDevicesFromMeta( } } +void UpdateLaggingDevicesAfterMetaUpdate(NProto::TVolumeMeta& meta) +{ + for (auto& agent: *meta.MutableLaggingAgentsInfo()->MutableAgents()) { + agent.ClearDevices(); + + TVector updatedLaggingDevices; + auto replicaIndex = GetAgentDevicesIndexes( + meta, + agent.GetNodeId(), + &updatedLaggingDevices); + if (!replicaIndex.has_value()) { + continue; + } + + Y_DEBUG_ABORT_UNLESS(*replicaIndex == agent.GetReplicaIndex()); + Y_DEBUG_ABORT_UNLESS(!updatedLaggingDevices.empty()); + for (auto& laggingDevice: updatedLaggingDevices) { + *agent.AddDevices() = std::move(laggingDevice); + } + } + + EraseIf( + *meta.MutableLaggingAgentsInfo()->MutableAgents(), + [](const NProto::TLaggingAgent& laggingAgent) + { return laggingAgent.GetDevices().empty(); }); + if (meta.GetLaggingAgentsInfo().GetAgents().empty()) { + meta.MutableLaggingAgentsInfo()->Clear(); + } +} + } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/volume/model/helpers.h b/cloud/blockstore/libs/storage/volume/model/helpers.h index 6c949f53b7..e7cbedebd6 100644 --- a/cloud/blockstore/libs/storage/volume/model/helpers.h +++ b/cloud/blockstore/libs/storage/volume/model/helpers.h @@ -19,18 +19,16 @@ struct TLaggingDeviceIndexCmp [[nodiscard]] std::optional GetAgentDevicesIndexes( const NProto::TVolumeMeta& meta, ui32 agentNodeId, - TVector& laggingDevices); + TVector* laggingDevices); [[nodiscard]] TSet ReplicaIndexesWithFreshDevices( const NProto::TVolumeMeta& meta, - NProto::TLaggingDevice device); - -[[nodiscard]] bool CheckReplicasPlacementAreCorrect( - const NProto::TVolumeMeta& meta, - ui32 agentNodeId); + ui32 rowIndex); void RemoveLaggingDevicesFromMeta( NProto::TVolumeMeta& meta, - const TVector laggingDeviceIds); + const TVector& laggingDeviceIds); + +void UpdateLaggingDevicesAfterMetaUpdate(NProto::TVolumeMeta& meta); } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/volume/volume_actor_allocatedisk.cpp b/cloud/blockstore/libs/storage/volume/volume_actor_allocatedisk.cpp index c28366519a..4c2b07c0a4 100644 --- a/cloud/blockstore/libs/storage/volume/volume_actor_allocatedisk.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_actor_allocatedisk.cpp @@ -157,6 +157,7 @@ NProto::TVolumeMeta CreateNewMeta( newMeta.SetIOModeTs(args.IOModeTs.MicroSeconds()); newMeta.SetMuteIOErrors(args.MuteIOErrors); RemoveLaggingDevicesFromMeta(newMeta, args.RemovedLaggingDeviceIds); + UpdateLaggingDevicesAfterMetaUpdate(newMeta); return newMeta; } diff --git a/cloud/blockstore/libs/storage/volume/volume_actor_lagging_agents.cpp b/cloud/blockstore/libs/storage/volume/volume_actor_lagging_agents.cpp index 326f2c1761..f1e1b2add8 100644 --- a/cloud/blockstore/libs/storage/volume/volume_actor_lagging_agents.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_actor_lagging_agents.cpp @@ -131,12 +131,12 @@ void TVolumeActor::HandleDeviceTimeouted( } TVector timeoutedAgentDevices; - const auto replicaIndex = GetAgentDevicesIndexes( + const auto timeoutedDeviceReplicaIndex = GetAgentDevicesIndexes( meta, timeoutedDeviceConfig->GetNodeId(), - timeoutedAgentDevices); + &timeoutedAgentDevices); Y_DEBUG_ABORT_UNLESS(!timeoutedAgentDevices.empty()); - Y_DEBUG_ABORT_UNLESS(replicaIndex); + Y_DEBUG_ABORT_UNLESS(timeoutedDeviceReplicaIndex); for (const auto& laggingAgent: meta.GetLaggingAgentsInfo().GetAgents()) { // Whether the agent is lagging already. @@ -155,7 +155,7 @@ void TVolumeActor::HandleDeviceTimeouted( ctx, State->GetDiskRegistryBasedPartitionActor(), std::make_unique( - *replicaIndex, + *timeoutedDeviceReplicaIndex, timeoutedDeviceConfig->GetAgentId())); auto response = @@ -186,9 +186,8 @@ void TVolumeActor::HandleDeviceTimeouted( TLaggingDeviceIndexCmp()); if (!rowIndexesIntersection.empty()) { - Y_DEBUG_ABORT_UNLESS( - laggingAgent.GetReplicaIndex() != replicaIndex); - + // TODO(komarevtsev-d): Allow source and target of the migration to + // lag at the same time. LOG_WARN( ctx, TBlockStoreComponents::VOLUME, @@ -213,9 +212,9 @@ void TVolumeActor::HandleDeviceTimeouted( // Check for fresh devices in the same row. for (const auto& laggingDevice: timeoutedAgentDevices) { TSet replicaIndexes = - ReplicaIndexesWithFreshDevices(meta, laggingDevice); + ReplicaIndexesWithFreshDevices(meta, laggingDevice.GetRowIndex()); const bool laggingDeviceIsFresh = - replicaIndexes.contains(*replicaIndex); + replicaIndexes.contains(*timeoutedDeviceReplicaIndex); if (replicaIndexes.size() - laggingDeviceIsFresh > 0) { LOG_WARN( ctx, @@ -239,7 +238,8 @@ void TVolumeActor::HandleDeviceTimeouted( NProto::TLaggingAgent unavailableAgent; unavailableAgent.SetAgentId(timeoutedDeviceConfig->GetAgentId()); - unavailableAgent.SetReplicaIndex(*replicaIndex); + unavailableAgent.SetNodeId(timeoutedDeviceConfig->GetNodeId()); + unavailableAgent.SetReplicaIndex(*timeoutedDeviceReplicaIndex); unavailableAgent.MutableDevices()->Assign( timeoutedAgentDevices.begin(), timeoutedAgentDevices.end()); diff --git a/cloud/blockstore/libs/storage/volume/volume_actor_updateconfig.cpp b/cloud/blockstore/libs/storage/volume/volume_actor_updateconfig.cpp index af3b0e8f5c..0f0ff89ada 100644 --- a/cloud/blockstore/libs/storage/volume/volume_actor_updateconfig.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_actor_updateconfig.cpp @@ -249,6 +249,7 @@ void TVolumeActor::FinishUpdateVolumeConfig(const TActorContext& ctx) RemoveLaggingDevicesFromMeta( newMeta, UnfinishedUpdateVolumeConfig.RemovedLaggingDeviceIds); + UpdateLaggingDevicesAfterMetaUpdate(newMeta); } UnfinishedUpdateVolumeConfig.Devices = {}; diff --git a/cloud/blockstore/libs/storage/volume/volume_lagging_agent_ut.cpp b/cloud/blockstore/libs/storage/volume/volume_lagging_agent_ut.cpp index 0005a12e48..22adc96165 100644 --- a/cloud/blockstore/libs/storage/volume/volume_lagging_agent_ut.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_lagging_agent_ut.cpp @@ -22,7 +22,7 @@ TVector MakeDeviceList(ui32 agentCount, ui32 deviceCount) for (ui32 i = 1; i <= agentCount; i++) { for (ui32 j = 0; j < deviceCount; j++) { auto device = MakeDevice( - Sprintf("uuid%u-%u", i, j), + Sprintf("uuid-%u.%u", i, j), Sprintf("dev%u", j), Sprintf("transport%u-%u", i, j)); device.SetNodeId(i - 1); @@ -77,20 +77,20 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) const auto& devices = stat->Record.GetVolume().GetDevices(); const auto& replicas = stat->Record.GetVolume().GetReplicas(); UNIT_ASSERT_VALUES_EQUAL(1, devices.size()); - UNIT_ASSERT_VALUES_EQUAL("uuid1-0", devices[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-1.0", devices[0].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-1", devices[0].GetAgentId()); UNIT_ASSERT_VALUES_EQUAL(2, replicas.size()); UNIT_ASSERT_VALUES_EQUAL(1, replicas[0].DevicesSize()); UNIT_ASSERT_VALUES_EQUAL( - "uuid2-0", + "uuid-2.0", replicas[0].GetDevices(0).GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL( "agent-2", replicas[0].GetDevices(0).GetAgentId()); UNIT_ASSERT_VALUES_EQUAL(1, replicas[1].DevicesSize()); UNIT_ASSERT_VALUES_EQUAL( - "uuid3-0", + "uuid-3.0", replicas[1].GetDevices(0).GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL( "agent-3", @@ -124,7 +124,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) }); // Device in the first replica is timeouted. - volume.DeviceTimeouted(0, "uuid2-0"); + volume.DeviceTimeouted(0, "uuid-2.0"); UNIT_ASSERT(addLaggingAgentRequest.has_value()); UNIT_ASSERT_VALUES_EQUAL( @@ -133,7 +133,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) UNIT_ASSERT_VALUES_EQUAL(1, addLaggingAgentRequest->ReplicaIndex); // Can't add more lagging devices in the same row. - volume.SendDeviceTimeoutedRequest(0, "uuid3-0"); + volume.SendDeviceTimeoutedRequest(0, "uuid-3.0"); auto response = volume.RecvDeviceTimeoutedResponse(); UNIT_ASSERT_VALUES_EQUAL( E_INVALID_STATE, @@ -147,7 +147,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) UNIT_ASSERT(!addLaggingAgentRequest.has_value()); // Now the zeroth replica can lag. - volume.DeviceTimeouted(0, "uuid1-0"); + volume.DeviceTimeouted(0, "uuid-1.0"); UNIT_ASSERT(addLaggingAgentRequest.has_value()); UNIT_ASSERT_VALUES_EQUAL( devices[0].GetAgentId(), @@ -192,31 +192,31 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) auto stat = volume.StatVolume(); const auto& devices = stat->Record.GetVolume().GetDevices(); UNIT_ASSERT_VALUES_EQUAL(3, devices.size()); - UNIT_ASSERT_VALUES_EQUAL("uuid1-0", devices[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-1.0", devices[0].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-1", devices[0].GetAgentId()); - UNIT_ASSERT_VALUES_EQUAL("uuid1-1", devices[1].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-1.1", devices[1].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-1", devices[1].GetAgentId()); - UNIT_ASSERT_VALUES_EQUAL("uuid4-0", devices[2].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-4.0", devices[2].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-4", devices[2].GetAgentId()); const auto& replicas = stat->Record.GetVolume().GetReplicas(); UNIT_ASSERT_VALUES_EQUAL(2, replicas.size()); const auto& replica1Devices = replicas[0].GetDevices(); UNIT_ASSERT_VALUES_EQUAL(3, replica1Devices.size()); - UNIT_ASSERT_VALUES_EQUAL("uuid2-0", replica1Devices[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-2.0", replica1Devices[0].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-2", replica1Devices[0].GetAgentId()); - UNIT_ASSERT_VALUES_EQUAL("uuid2-1", replica1Devices[1].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-2.1", replica1Devices[1].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-2", replica1Devices[1].GetAgentId()); - UNIT_ASSERT_VALUES_EQUAL("uuid5-0", replica1Devices[2].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-5.0", replica1Devices[2].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-5", replica1Devices[2].GetAgentId()); const auto& replica2Devices = replicas[1].GetDevices(); UNIT_ASSERT_VALUES_EQUAL(3, replica2Devices.size()); - UNIT_ASSERT_VALUES_EQUAL("uuid3-0", replica2Devices[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-3.0", replica2Devices[0].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-3", replica2Devices[0].GetAgentId()); - UNIT_ASSERT_VALUES_EQUAL("uuid3-1", replica2Devices[1].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-3.1", replica2Devices[1].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-3", replica2Devices[1].GetAgentId()); - UNIT_ASSERT_VALUES_EQUAL("uuid6-0", replica2Devices[2].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("uuid-6.0", replica2Devices[2].GetDeviceUUID()); UNIT_ASSERT_VALUES_EQUAL("agent-6", replica2Devices[2].GetAgentId()); std::optional @@ -245,7 +245,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) }); // Device in the zeroth replica is timeouted. - volume.DeviceTimeouted(1, "uuid1-1"); + volume.DeviceTimeouted(1, "uuid-1.1"); UNIT_ASSERT(addLaggingAgentRequest.has_value()); UNIT_ASSERT_VALUES_EQUAL( @@ -256,7 +256,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) { addLaggingAgentRequest.reset(); // The first agent is already lagging. - volume.SendDeviceTimeoutedRequest(0, "uuid1-0"); + volume.SendDeviceTimeoutedRequest(0, "uuid-1.0"); auto response = volume.RecvDeviceTimeoutedResponse(); UNIT_ASSERT_VALUES_EQUAL(S_ALREADY, response->GetError().GetCode()); UNIT_ASSERT(addLaggingAgentRequest.has_value()); @@ -268,7 +268,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) { // 0 and 1st rows already lagging. Can't add more lagging devices on // these rows. - volume.SendDeviceTimeoutedRequest(0, "uuid2-1"); + volume.SendDeviceTimeoutedRequest(0, "uuid-2.1"); auto response = volume.RecvDeviceTimeoutedResponse(); UNIT_ASSERT_VALUES_EQUAL( E_INVALID_STATE, @@ -277,7 +277,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) // Adding the second row to lagging. addLaggingAgentRequest.reset(); - volume.DeviceTimeouted(2, "uuid6-0"); + volume.DeviceTimeouted(2, "uuid-6.0"); UNIT_ASSERT(addLaggingAgentRequest.has_value()); UNIT_ASSERT_VALUES_EQUAL( replica2Devices[2].GetAgentId(), @@ -294,13 +294,13 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) 3, addLaggingDevicesRequest->GetLaggingDevices().size()); UNIT_ASSERT_VALUES_EQUAL( - "DeviceUUID: \"uuid1-0\"\n", + "DeviceUUID: \"uuid-1.0\"\n", addLaggingDevicesRequest->GetLaggingDevices(0).DebugString()); UNIT_ASSERT_VALUES_EQUAL( - "DeviceUUID: \"uuid1-1\"\nRowIndex: 1\n", + "DeviceUUID: \"uuid-1.1\"\nRowIndex: 1\n", addLaggingDevicesRequest->GetLaggingDevices(1).DebugString()); UNIT_ASSERT_VALUES_EQUAL( - "DeviceUUID: \"uuid6-0\"\nRowIndex: 2\n", + "DeviceUUID: \"uuid-6.0\"\nRowIndex: 2\n", addLaggingDevicesRequest->GetLaggingDevices(2).DebugString()); // Disk Registry will remove lagging devices on reallocation. @@ -372,7 +372,7 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) }); // Device in the zeroth replica is timeouted. - volume.DeviceTimeouted(1, "uuid1-1"); + volume.DeviceTimeouted(1, "uuid-1.1"); UNIT_ASSERT(!addLaggingDevicesRequest.has_value()); // Update volume config. @@ -408,6 +408,182 @@ Y_UNIT_TEST_SUITE(TLaggingAgentVolumeTest) .GetDevices() .size()); } + + Y_UNIT_TEST(ShouldHandleMigratingDevice) + { + constexpr ui32 AgentCount = 7; + constexpr ui32 DevicePerAgentCount = 2; + auto diskRegistryState = MakeIntrusive(); + diskRegistryState->Devices = + MakeDeviceList(AgentCount - 1, DevicePerAgentCount); + diskRegistryState->AllocateDiskReplicasOnDifferentNodes = true; + diskRegistryState->ReplicaCount = 2; + diskRegistryState->MigrationMode = EMigrationMode::InProgress; + + // Add migration device. + { + auto device = MakeDevice( + "uuid-migration", + "dev-migration", + "transport-migration"); + device.SetNodeId(AgentCount - 1); + device.SetAgentId(Sprintf("agent-%u", AgentCount)); + diskRegistryState->MigrationDevices["uuid-1.0"] = device; + diskRegistryState->Devices.push_back(device); + } + + TVector agentStates; + for (ui32 i = 0; i < AgentCount; i++) { + agentStates.push_back(TDiskAgentStatePtr{}); + } + auto runtime = PrepareTestActorRuntime( + {}, + diskRegistryState, + {}, + {}, + std::move(agentStates)); + + TVolumeClient volume(*runtime); + const ui64 blockCount = DefaultDeviceBlockCount * + DefaultDeviceBlockSize / DefaultBlockSize * 3; + volume.UpdateVolumeConfig( + 0, + 0, + 0, + 0, + false, + 1, // version + NCloud::NProto::STORAGE_MEDIA_SSD_MIRROR3, + blockCount); + volume.WaitReady(); + + auto stat = volume.StatVolume(); + const auto& devices = stat->Record.GetVolume().GetDevices(); + UNIT_ASSERT_VALUES_EQUAL(3, devices.size()); + UNIT_ASSERT_VALUES_EQUAL("uuid-1.0", devices[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-1", devices[0].GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL("uuid-1.1", devices[1].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-1", devices[1].GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL("uuid-4.0", devices[2].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-4", devices[2].GetAgentId()); + + const auto& replicas = stat->Record.GetVolume().GetReplicas(); + UNIT_ASSERT_VALUES_EQUAL(2, replicas.size()); + const auto& replica1Devices = replicas[0].GetDevices(); + UNIT_ASSERT_VALUES_EQUAL(3, replica1Devices.size()); + UNIT_ASSERT_VALUES_EQUAL( + "uuid-2.0", + replica1Devices[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-2", replica1Devices[0].GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL( + "uuid-2.1", + replica1Devices[1].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-2", replica1Devices[1].GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL( + "uuid-5.0", + replica1Devices[2].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-5", replica1Devices[2].GetAgentId()); + + const auto& replica2Devices = replicas[1].GetDevices(); + UNIT_ASSERT_VALUES_EQUAL(3, replica2Devices.size()); + UNIT_ASSERT_VALUES_EQUAL( + "uuid-3.0", + replica2Devices[0].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-3", replica2Devices[0].GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL( + "uuid-3.1", + replica2Devices[1].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-3", replica2Devices[1].GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL( + "uuid-6.0", + replica2Devices[2].GetDeviceUUID()); + UNIT_ASSERT_VALUES_EQUAL("agent-6", replica2Devices[2].GetAgentId()); + + const auto& migrations = stat->Record.GetVolume().GetMigrations(); + UNIT_ASSERT_VALUES_EQUAL(1, migrations.size()); + UNIT_ASSERT_VALUES_EQUAL("uuid-1.0", migrations[0].GetSourceDeviceId()); + UNIT_ASSERT_VALUES_EQUAL( + "uuid-migration", + migrations[0].GetTargetDevice().GetDeviceUUID()); + + std::optional + addLaggingAgentRequest; + std::optional + addLaggingDevicesRequest; + runtime->SetEventFilter( + [&](TTestActorRuntimeBase&, TAutoPtr& event) + { + switch (event->GetTypeRewrite()) { + case TEvPartition::EvAddLaggingAgentRequest: { + auto* msg = event->Get< + TEvPartition::TEvAddLaggingAgentRequest>(); + UNIT_ASSERT(!addLaggingAgentRequest.has_value()); + addLaggingAgentRequest = *msg; + return true; + } + case TEvDiskRegistry::EvAddLaggingDevicesRequest: { + auto* msg = event->Get< + TEvDiskRegistry::TEvAddLaggingDevicesRequest>(); + addLaggingDevicesRequest = msg->Record; + break; + } + } + return false; + }); + + // Device in the zeroth replica is timeouted. + volume.DeviceTimeouted(0, "uuid-migration"); + + UNIT_ASSERT(addLaggingAgentRequest.has_value()); + UNIT_ASSERT_VALUES_EQUAL("agent-7", addLaggingAgentRequest->AgentId); + UNIT_ASSERT_VALUES_EQUAL(0, addLaggingAgentRequest->ReplicaIndex); + + { + addLaggingAgentRequest.reset(); + // The zeroth row is already lagging. + volume.SendDeviceTimeoutedRequest(0, "uuid-1.0"); + auto response = volume.RecvDeviceTimeoutedResponse(); + UNIT_ASSERT_VALUES_EQUAL( + E_INVALID_STATE, + response->GetError().GetCode()); + UNIT_ASSERT(!addLaggingAgentRequest.has_value()); + } + + // Adding the second row to lagging. + addLaggingAgentRequest.reset(); + volume.DeviceTimeouted(2, "uuid-6.0"); + UNIT_ASSERT(addLaggingAgentRequest.has_value()); + UNIT_ASSERT_VALUES_EQUAL( + replica2Devices[2].GetAgentId(), + addLaggingAgentRequest->AgentId); + + // Rebooting the volume tablet should report lagging devices to the DR. + UNIT_ASSERT(!addLaggingDevicesRequest.has_value()); + volume.RebootTablet(); + runtime->DispatchEvents({}, TDuration::Seconds(1)); + UNIT_ASSERT(addLaggingDevicesRequest.has_value()); + + UNIT_ASSERT_VALUES_EQUAL("vol0", addLaggingDevicesRequest->GetDiskId()); + UNIT_ASSERT_VALUES_EQUAL( + 2, + addLaggingDevicesRequest->GetLaggingDevices().size()); + UNIT_ASSERT_VALUES_EQUAL( + "DeviceUUID: \"uuid-migration\"\n", + addLaggingDevicesRequest->GetLaggingDevices(0).DebugString()); + UNIT_ASSERT_VALUES_EQUAL( + "DeviceUUID: \"uuid-6.0\"\nRowIndex: 2\n", + addLaggingDevicesRequest->GetLaggingDevices(1).DebugString()); + + // Disk Registry will remove lagging devices on reallocation. + volume.ReallocateDisk(); + auto metaHistoryResponse = volume.ReadMetaHistory(); + UNIT_ASSERT(!metaHistoryResponse->MetaHistory.empty()); + UNIT_ASSERT_VALUES_EQUAL( + 0, + metaHistoryResponse->MetaHistory.back() + .Meta.GetLaggingAgentsInfo() + .AgentsSize()); + } } } // namespace NCloud::NBlockStore::NStorage