Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A0-4591: Added major sync metric back (although by different name) #1919

Merged
merged 5 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions bin/node/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ pub fn new_authority(

let backoff_authoring_blocks = Some(LimitNonfinalized(aleph_config.max_nonfinalized_blocks()));
let prometheus_registry = config.prometheus_registry().cloned();
let (sync_oracle, major_sync) = SyncOracle::new();
let sync_oracle = SyncOracle::new();
let proposer_factory = get_proposer_factory(&service_components, &config);
let slot_duration = sc_consensus_aura::slot_duration(&*service_components.client)?;
let (block_import, block_rx) = RedirectingBlockImport::new(service_components.client.clone());
Expand Down Expand Up @@ -311,7 +311,7 @@ pub fn new_authority(
network_config,
config.protocol_id(),
service_components.client.clone(),
major_sync,
sync_oracle.underlying_atomic(),
service_components.transaction_pool.clone(),
&service_components.task_manager.spawn_handle(),
config
Expand Down
9 changes: 0 additions & 9 deletions finality-aleph/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,13 @@ pallet-aleph-runtime-api = { workspace = true, features = ["std"] }

async-trait = { workspace = true }
array-bytes = { workspace = true }
bytes = { workspace = true }
derive_more = { workspace = true }
env_logger = { workspace = true }
futures = { workspace = true }
futures-timer = { workspace = true }
hash-db = { workspace = true }
hex = { workspace = true }
ip_network = { workspace = true }
log = { workspace = true }
lru = { workspace = true }
parity-scale-codec = { workspace = true, features = ["derive"] }
scale-info = { workspace = true, features = ["derive"] }
parking_lot = { workspace = true }
rand = { workspace = true }
serde = { workspace = true }
Expand All @@ -62,7 +57,6 @@ sc-network-light = { workspace = true }
sc-network-transactions = { workspace = true }
sc-rpc = { workspace = true }
sc-service = { workspace = true }
sc-telemetry = { workspace = true }
sc-transaction-pool = { workspace = true }
sc-transaction-pool-api = { workspace = true }
sc-utils = { workspace = true }
Expand All @@ -74,12 +68,9 @@ sp-consensus = { workspace = true }
sp-consensus-aura = { workspace = true }
sp-consensus-slots = { workspace = true }
sp-core = { workspace = true }
sp-io = { workspace = true }
sp-keystore = { workspace = true }
sp-runtime = { workspace = true }
sp-state-machine = { workspace = true }
sp-timestamp = { workspace = true }
sp-trie = { workspace = true }

[dev-dependencies]
substrate-test-runtime-client = { workspace = true }
Expand Down
2 changes: 1 addition & 1 deletion finality-aleph/src/party/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ mod tests {
let readonly_session_authorities = shared_map.read_only();

let chain_state = Arc::new(MockChainState::new());
let (sync_oracle, _) = SyncOracle::new();
let sync_oracle = SyncOracle::new();
let session_manager = Arc::new(MockNodeSessionManager::new());
let session_info = SessionBoundaryInfo::new(session_period);

Expand Down
4 changes: 2 additions & 2 deletions finality-aleph/src/sync/handler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ mod tests {
let handler = Handler::new(
database_io,
verifier,
SyncOracle::new().0,
SyncOracle::new(),
SESSION_BOUNDARY_INFO,
)
.expect("mock backend works");
Expand Down Expand Up @@ -1718,7 +1718,7 @@ mod tests {
let mut handler = Handler::new(
database_io,
verifier,
SyncOracle::new().0,
SyncOracle::new(),
SessionBoundaryInfo::new(SessionPeriod(20)),
)
.expect("mock backend works");
Expand Down
48 changes: 45 additions & 3 deletions finality-aleph/src/sync/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
use std::collections::HashMap;
use std::{
collections::HashMap,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};

use substrate_prometheus_endpoint::{register, Counter, PrometheusError, Registry, U64};
use substrate_prometheus_endpoint::{
register, Counter, MetricSource, Opts, PrometheusError, Registry, SourcedGauge, U64,
};

#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum Event {
Expand Down Expand Up @@ -82,8 +90,39 @@ pub enum Metrics {
Noop,
}

#[derive(Clone)]
pub struct MajorSyncingGauge(Arc<AtomicBool>);

impl MajorSyncingGauge {
fn register(registry: &Registry, value: Arc<AtomicBool>) -> Result<(), PrometheusError> {
register(
SourcedGauge::new(
&Opts::new(
"aleph_sync_is_major_syncing",
"Whether the node is performing a major sync or not.",
),
MajorSyncingGauge(value),
)?,
registry,
)?;

Ok(())
}
}

impl MetricSource for MajorSyncingGauge {
type N = u64;

fn collect(&self, mut set: impl FnMut(&[&str], Self::N)) {
set(&[], self.0.load(Ordering::Relaxed) as u64);
}
}

impl Metrics {
pub fn new(registry: Option<Registry>) -> Result<Self, PrometheusError> {
pub fn new(
is_major_syncing: Arc<AtomicBool>,
registry: Option<Registry>,
) -> Result<Self, PrometheusError> {
let registry = match registry {
Some(registry) => registry,
None => return Ok(Metrics::Noop),
Expand Down Expand Up @@ -114,6 +153,9 @@ impl Metrics {
)?,
);
}

MajorSyncingGauge::register(&registry, is_major_syncing)?;

Ok(Metrics::Prometheus {
event_calls,
event_errors,
Expand Down
3 changes: 2 additions & 1 deletion finality-aleph/src/sync/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,13 @@ where
database_io,
} = io;
let network = VersionWrapper::new(network);
let is_major_syncing = sync_oracle.underlying_atomic();
let handler = Handler::new(database_io, verifier, sync_oracle, session_info)?;
let tasks = TaskQueue::new();
let broadcast_ticker = Ticker::new(TICK_PERIOD, BROADCAST_COOLDOWN);
let chain_extension_ticker = Ticker::new(TICK_PERIOD, CHAIN_EXTENSION_COOLDOWN);
let (block_requests_for_sync, block_requests_from_user) = mpsc::unbounded();
let metrics = Metrics::new(metrics_registry).unwrap_or_else(|e| {
let metrics = Metrics::new(is_major_syncing, metrics_registry).unwrap_or_else(|e| {
warn!(target: LOG_TARGET, "Failed to create metrics: {}.", e);
Metrics::noop()
});
Expand Down
16 changes: 8 additions & 8 deletions finality-aleph/src/sync_oracle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,24 @@ const OFFLINE_THRESHOLD: Duration = Duration::from_secs(6);
const FAR_BEHIND_THRESHOLD: u32 = 15;
const MAJOR_SYNC_THRESHOLD: Duration = Duration::from_secs(10);

pub type MajorSyncIndicator = Arc<AtomicBool>;

/// A sync oracle implementation tracking how recently the node was far behind the highest known justification.
/// It defines being in major sync as being more than 15 blocks behind the highest known justification less than 10 seconds ago.
/// It defines being offline as not getting any update for at least 6 seconds (or never at all).
#[derive(Clone)]
pub struct SyncOracle {
last_far_behind: Arc<Mutex<Instant>>,
last_update: Arc<Mutex<Instant>>,
// TODO: remove when SyncingService is no longer needed
is_major_syncing: Arc<AtomicBool>,
}

impl SyncOracle {
pub fn new() -> (Self, MajorSyncIndicator) {
pub fn new() -> Self {
let is_major_syncing = Arc::new(AtomicBool::new(true));
let oracle = SyncOracle {
SyncOracle {
last_update: Arc::new(Mutex::new(Instant::now() - OFFLINE_THRESHOLD)),
last_far_behind: Arc::new(Mutex::new(Instant::now())),
is_major_syncing: is_major_syncing.clone(),
};
(oracle, is_major_syncing)
}
}

pub fn update_behind(&self, behind: u32) {
Expand All @@ -53,11 +49,15 @@ impl SyncOracle {
.store(is_major_syncing, Ordering::Relaxed);
is_major_syncing
}

pub fn underlying_atomic(&self) -> Arc<AtomicBool> {
self.is_major_syncing.clone()
}
}

impl Default for SyncOracle {
fn default() -> Self {
SyncOracle::new().0
SyncOracle::new()
}
}

Expand Down