diff --git a/CHANGELOG.md b/CHANGELOG.md index ddd3ab067..544ed2851 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - Replaced NTX Builder's in-memory state management with SQLite-backed persistence; account states, notes, and transaction effects are now stored in the database and inflight state is purged on startup ([#1662](https://github.com/0xMiden/miden-node/pull/1662)). - [BREAKING] Reworked `miden-remote-prover`, removing the `worker`/`proxy` distinction and simplifying to a `worker` with a request queue ([#1688](https://github.com/0xMiden/miden-node/pull/1688)). - NTX Builder actors now deactivate after being idle for a configurable idle timeout (`--ntx-builder.idle-timeout`, default 5 min) and are re-activated when new notes target their account ([#1705](https://github.com/0xMiden/miden-node/pull/1705)). +- NTX Builder now blacklists network accounts whose actors crash repeatedly (configurable via `--ntx-builder.max-actor-crashes`, default 10) ([#1712](https://github.com/0xMiden/miden-node/pull/1712)). ## v0.13.7 (2026-02-25) diff --git a/bin/node/src/commands/mod.rs b/bin/node/src/commands/mod.rs index 6a63f1ed2..2b49b10dc 100644 --- a/bin/node/src/commands/mod.rs +++ b/bin/node/src/commands/mod.rs @@ -138,6 +138,13 @@ pub struct NtxBuilderConfig { )] pub idle_timeout: Duration, + /// Maximum number of crashes before an account actor is blacklisted. + /// + /// Once an actor for a given account exceeds this crash count, no new actor will be + /// spawned for that account. + #[arg(long = "ntx-builder.max-actor-crashes", default_value_t = 10, value_name = "NUM")] + pub max_actor_crashes: usize, + /// Directory for the ntx-builder's persistent database. /// /// If not set, defaults to the node's data directory. @@ -169,6 +176,7 @@ impl NtxBuilderConfig { .with_tx_prover_url(self.tx_prover_url) .with_script_cache_size(self.script_cache_size) .with_idle_timeout(self.idle_timeout) + .with_max_actor_crashes(self.max_actor_crashes) } } diff --git a/crates/ntx-builder/src/coordinator.rs b/crates/ntx-builder/src/coordinator.rs index 175e04915..f5cf34555 100644 --- a/crates/ntx-builder/src/coordinator.rs +++ b/crates/ntx-builder/src/coordinator.rs @@ -95,16 +95,29 @@ pub struct Coordinator { /// Database for persistent state. db: Db, + + /// Tracks the number of crashes per account actor. + /// + /// When an actor shuts down due to a DB error, its crash count is incremented. Once + /// the count reaches `max_actor_crashes`, the account is blacklisted and no new actor + /// will be spawned for it. + crash_counts: HashMap, + + /// Maximum number of crashes an account actor is allowed before being blacklisted. + max_actor_crashes: usize, } impl Coordinator { - /// Creates a new coordinator with the specified maximum number of inflight transactions. - pub fn new(max_inflight_transactions: usize, db: Db) -> Self { + /// Creates a new coordinator with the specified maximum number of inflight transactions + /// and the crash threshold for account blacklisting. + pub fn new(max_inflight_transactions: usize, max_actor_crashes: usize, db: Db) -> Self { Self { actor_registry: HashMap::new(), actor_join_set: JoinSet::new(), semaphore: Arc::new(Semaphore::new(max_inflight_transactions)), db, + crash_counts: HashMap::new(), + max_actor_crashes, } } @@ -117,6 +130,18 @@ impl Coordinator { pub fn spawn_actor(&mut self, origin: AccountOrigin, actor_context: &AccountActorContext) { let account_id = origin.id(); + // Skip spawning if the account has been blacklisted due to repeated crashes. + if let Some(&count) = self.crash_counts.get(&account_id) { + if count >= self.max_actor_crashes { + tracing::warn!( + %account_id, + crash_count = count, + "Account blacklisted due to repeated crashes, skipping actor spawn" + ); + return; + } + } + // If an actor already exists for this account ID, something has gone wrong. if let Some(handle) = self.actor_registry.remove(&account_id) { tracing::error!( @@ -173,7 +198,13 @@ impl Coordinator { }, ActorShutdownReason::SemaphoreFailed(err) => Err(err).context("semaphore failed"), ActorShutdownReason::DbError(account_id) => { - tracing::error!(account_id = %account_id, "Account actor shut down due to DB error"); + let count = self.crash_counts.entry(account_id).or_insert(0); + *count += 1; + tracing::error!( + %account_id, + crash_count = *count, + "Account actor shut down due to DB error" + ); Ok(None) }, ActorShutdownReason::IdleTimeout(account_id) => { @@ -305,17 +336,55 @@ impl Coordinator { #[cfg(test)] mod tests { + use std::num::NonZeroUsize; + use std::sync::Arc; + use std::time::Duration; + use miden_node_proto::domain::mempool::MempoolEvent; use miden_node_proto::domain::note::NetworkNote; + use miden_node_utils::lru_cache::LruCache; + use tokio::sync::{RwLock, mpsc}; + use url::Url; use super::*; + use crate::actor::{AccountActorContext, AccountOrigin}; + use crate::chain_state::ChainState; + use crate::clients::StoreClient; use crate::db::Db; use crate::test_utils::*; /// Creates a coordinator with default settings backed by a temp DB. async fn test_coordinator() -> (Coordinator, tempfile::TempDir) { let (db, dir) = Db::test_setup().await; - (Coordinator::new(4, db), dir) + (Coordinator::new(4, 10, db), dir) + } + + /// Creates a minimal `AccountActorContext` suitable for unit tests. + /// + /// The URLs are fake and actors spawned with this context will fail on their first gRPC call, + /// but this is sufficient for testing coordinator logic (registry, blacklisting, etc.). + fn test_actor_context(db: &Db) -> AccountActorContext { + use miden_protocol::crypto::merkle::mmr::{Forest, MmrPeaks, PartialMmr}; + + let url = Url::parse("http://127.0.0.1:1").unwrap(); + let block_header = mock_block_header(0_u32.into()); + let chain_mmr = PartialMmr::from_peaks(MmrPeaks::new(Forest::new(0), vec![]).unwrap()); + let chain_state = Arc::new(RwLock::new(ChainState::new(block_header, chain_mmr))); + let (request_tx, _request_rx) = mpsc::channel(1); + + AccountActorContext { + block_producer_url: url.clone(), + validator_url: url.clone(), + tx_prover_url: None, + chain_state, + store: StoreClient::new(url), + script_cache: LruCache::new(NonZeroUsize::new(1).unwrap()), + max_notes_per_tx: NonZeroUsize::new(1).unwrap(), + max_note_attempts: 1, + idle_timeout: Duration::from_secs(60), + db: db.clone(), + request_tx, + } } /// Registers a dummy actor handle (no real actor task) in the coordinator's registry. @@ -358,4 +427,47 @@ mod tests { assert_eq!(inactive_targets.len(), 1); assert_eq!(inactive_targets[0], inactive_id); } + + // BLACKLIST TESTS + // ============================================================================================ + + #[tokio::test] + async fn spawn_actor_skips_blacklisted_account() { + let (db, _dir) = Db::test_setup().await; + let max_crashes = 3; + let mut coordinator = Coordinator::new(4, max_crashes, db.clone()); + let actor_context = test_actor_context(&db); + + let account_id = mock_network_account_id(); + + // Simulate the account having reached the crash threshold. + coordinator.crash_counts.insert(account_id, max_crashes); + + coordinator.spawn_actor(AccountOrigin::Store(account_id), &actor_context); + + assert!( + !coordinator.actor_registry.contains_key(&account_id), + "Blacklisted account should not have an actor in the registry" + ); + } + + #[tokio::test] + async fn spawn_actor_allows_below_threshold() { + let (db, _dir) = Db::test_setup().await; + let max_crashes = 3; + let mut coordinator = Coordinator::new(4, max_crashes, db.clone()); + let actor_context = test_actor_context(&db); + + let account_id = mock_network_account_id(); + + // Set crash count below the threshold. + coordinator.crash_counts.insert(account_id, max_crashes - 1); + + coordinator.spawn_actor(AccountOrigin::Store(account_id), &actor_context); + + assert!( + coordinator.actor_registry.contains_key(&account_id), + "Account below crash threshold should have an actor in the registry" + ); + } } diff --git a/crates/ntx-builder/src/lib.rs b/crates/ntx-builder/src/lib.rs index f91a6a3e8..4c20ec782 100644 --- a/crates/ntx-builder/src/lib.rs +++ b/crates/ntx-builder/src/lib.rs @@ -59,6 +59,9 @@ const DEFAULT_SCRIPT_CACHE_SIZE: NonZeroUsize = /// Default duration after which an idle network account actor will deactivate. const DEFAULT_IDLE_TIMEOUT: Duration = Duration::from_secs(5 * 60); +/// Default maximum number of crashes an account actor is allowed before being blacklisted. +const DEFAULT_MAX_ACTOR_CRASHES: usize = 10; + // CONFIGURATION // ================================================================================================= @@ -106,6 +109,13 @@ pub struct NtxBuilderConfig { /// A deactivated account will reactivate if targeted with new notes. pub idle_timeout: Duration, + /// Maximum number of crashes an account actor is allowed before being blacklisted. + /// + /// Once an actor for a given account exceeds this crash count, no new actor will be + /// spawned for that account. This prevents resource exhaustion from repeatedly failing + /// actors. + pub max_actor_crashes: usize, + /// Path to the SQLite database file used for persistent state. pub database_filepath: PathBuf, } @@ -129,6 +139,7 @@ impl NtxBuilderConfig { max_block_count: DEFAULT_MAX_BLOCK_COUNT, account_channel_capacity: DEFAULT_ACCOUNT_CHANNEL_CAPACITY, idle_timeout: DEFAULT_IDLE_TIMEOUT, + max_actor_crashes: DEFAULT_MAX_ACTOR_CRASHES, database_filepath, } } @@ -203,6 +214,13 @@ impl NtxBuilderConfig { self } + /// Sets the maximum number of crashes before an account actor is blacklisted. + #[must_use] + pub fn with_max_actor_crashes(mut self, max: usize) -> Self { + self.max_actor_crashes = max; + self + } + /// Builds and initializes the network transaction builder. /// /// This method connects to the store and block producer services, fetches the current @@ -222,7 +240,8 @@ impl NtxBuilderConfig { db.purge_inflight().await.context("failed to purge inflight state")?; let script_cache = LruCache::new(self.script_cache_size); - let coordinator = Coordinator::new(self.max_concurrent_txs, db.clone()); + let coordinator = + Coordinator::new(self.max_concurrent_txs, self.max_actor_crashes, db.clone()); let store = StoreClient::new(self.store_url.clone()); let block_producer = BlockProducerClient::new(self.block_producer_url.clone()); diff --git a/docs/external/src/operator/architecture.md b/docs/external/src/operator/architecture.md index 674507752..f5c96f25a 100644 --- a/docs/external/src/operator/architecture.md +++ b/docs/external/src/operator/architecture.md @@ -58,3 +58,7 @@ Internally, the builder spawns a dedicated actor for each network account that h idle (no notes to consume) for a configurable duration are automatically deactivated to conserve resources, and are re-activated when new notes arrive. The idle timeout can be tuned with the `--ntx-builder.idle-timeout` CLI argument (default: 5 minutes). + +Accounts whose actors crash repeatedly (due to database errors) are automatically blacklisted after a configurable +number of failures, preventing resource exhaustion. The threshold can be set with +`--ntx-builder.max-actor-crashes` (default: 10). diff --git a/docs/internal/src/ntx-builder.md b/docs/internal/src/ntx-builder.md index a662f7658..5b21a028f 100644 --- a/docs/internal/src/ntx-builder.md +++ b/docs/internal/src/ntx-builder.md @@ -51,5 +51,11 @@ argument (default: 5 minutes). Deactivated actors are re-spawned when new notes targeting their account are detected by the coordinator (via the `send_targeted` path). +If an actor repeatedly crashes (shuts down due to a database error), its crash count is tracked by +the coordinator. Once the count reaches the configurable threshold, the account is **blacklisted** +and no new actor will be spawned for it. This prevents resource exhaustion from a persistently +failing account. The threshold is configurable via the `--ntx-builder.max-actor-crashes` CLI +argument (default: 10). + The block-producer remains blissfully unaware of network transactions. From its perspective a network transaction is simply the same as any other.