From 247282aa466f48b73759dd2bfcfcb29c2ed2197e Mon Sep 17 00:00:00 2001 From: Ruben Fiszel Date: Wed, 11 Mar 2026 22:03:49 +0000 Subject: [PATCH 1/3] fix(pool): replace yield_now with exponential backoff in acquire retry loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On ARM/aarch64, the pool's acquire() loop can spin at 100% CPU when "phantom permits" accumulate — the semaphore has permits but the idle queue is empty and the pool is at max_connections. This happens when connections are discarded (expired/errored) while checked out: the DecrementSizeGuard releases a semaphore permit but the connection is never returned to the idle queue. The previous yield_now() returns almost immediately, causing a tight spin. Replace it with bounded exponential backoff (1ms to 256ms) to break the spin while giving checked-out connections time to be returned. Co-Authored-By: Claude Opus 4.6 --- sqlx-core/src/pool/inner.rs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/sqlx-core/src/pool/inner.rs b/sqlx-core/src/pool/inner.rs index b698dc9df0..773e1ecc83 100644 --- a/sqlx-core/src/pool/inner.rs +++ b/sqlx-core/src/pool/inner.rs @@ -254,6 +254,8 @@ impl PoolInner { let acquired = crate::rt::timeout( self.options.acquire_timeout, async { + let mut backoff_count: u32 = 0; + loop { // Handles the close-event internally let permit = self.acquire_permit().await?; @@ -279,11 +281,21 @@ impl PoolInner { // This can happen for a child pool that's at its connection limit, // or if the pool was closed between `acquire_permit()` and // `try_increment_size()`. + // + // It can also happen due to "phantom permits" — the semaphore + // has permits available but no idle connections exist and the + // pool is at max_connections. This occurs when a connection is + // discarded (expired/errored) while checked out: the + // DecrementSizeGuard releases a permit but the connection was + // never returned to the idle queue. + // + // Use exponential backoff to avoid spinning at 100% CPU on + // repeated phantom permit acquisitions. The backoff also gives + // time for checked-out connections to be returned to the pool. tracing::debug!("woke but was unable to acquire idle connection or open new one; retrying"); - // If so, we're likely in the current-thread runtime if it's Tokio, - // and so we should yield to let any spawned return_to_pool() tasks - // execute. - crate::rt::yield_now().await; + let backoff = Duration::from_millis(1 << cmp::min(backoff_count, 8)); + crate::rt::sleep(backoff).await; + backoff_count += 1; continue; } }; From 7caa7c0910dd1a2393834e9f5625082a8f3213ad Mon Sep 17 00:00:00 2001 From: Ruben Fiszel Date: Wed, 11 Mar 2026 22:23:10 +0000 Subject: [PATCH 2/3] test(pool): add reproducible test for phantom permit spin loop Adds two unit tests that synthetically reproduce the phantom permit condition observed on ARM/aarch64. The tests inject extra semaphore permits (without corresponding idle connections) into a pool at max_connections, then verify that acquire() times out cleanly via exponential backoff rather than spinning at 100% CPU. Run with: cargo test -p sqlx-core --features _rt-tokio,any phantom_permit Co-Authored-By: Claude Opus 4.6 --- sqlx-core/src/pool/inner.rs | 18 +++++ sqlx-core/src/pool/mod.rs | 131 ++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/sqlx-core/src/pool/inner.rs b/sqlx-core/src/pool/inner.rs index 773e1ecc83..76cb145514 100644 --- a/sqlx-core/src/pool/inner.rs +++ b/sqlx-core/src/pool/inner.rs @@ -633,3 +633,21 @@ impl Drop for DecrementSizeGuard { } } } + +#[cfg(test)] +impl PoolInner { + /// Simulate the "phantom permit" condition that can occur on ARM/aarch64 + /// due to weak memory ordering between `idle_conns.push()` and `semaphore.release()` + /// in the `release()` path. + /// + /// Sets `size` to `max_connections` and injects extra semaphore permits that + /// don't correspond to any idle connection, reproducing the state where + /// `acquire()` would spin in a tight loop with the old `yield_now()` code. + pub(super) fn inject_phantom_permits(&self, count: usize) { + // Set pool size to max so try_increment_size() will fail. + self.size + .store(self.options.max_connections, Ordering::Release); + // Add permits that don't correspond to idle connections. + self.semaphore.release(count); + } +} diff --git a/sqlx-core/src/pool/mod.rs b/sqlx-core/src/pool/mod.rs index f11ff1d76a..60113dad83 100644 --- a/sqlx-core/src/pool/mod.rs +++ b/sqlx-core/src/pool/mod.rs @@ -666,3 +666,134 @@ fn assert_pool_traits() { assert_clone::>(); } } + +#[cfg(test)] +#[cfg(all(feature = "any", feature = "_rt-tokio"))] +mod phantom_permit_tests { + use super::*; + use std::time::{Duration, Instant}; + + /// Reproduces the phantom permit spin loop that occurs on ARM/aarch64. + /// + /// On ARM, weak memory ordering can cause `semaphore.release()` in the + /// `release()` path to become visible to other cores before + /// `idle_conns.push()`, creating "phantom permits" — semaphore permits + /// with no corresponding idle connection. + /// + /// When the pool is at `max_connections`, this leads to a tight loop: + /// 1. acquire permit (succeeds — phantom) + /// 2. pop_idle() → None + /// 3. try_increment_size() → fails (at max) + /// 4. drop permit (re-released to semaphore) + /// 5. goto 1 + /// + /// This test synthetically injects phantom permits and verifies that + /// `acquire()` times out cleanly via exponential backoff rather than + /// spinning at 100% CPU. + /// + /// Run with: cargo test -p sqlx-core --features _rt-tokio,any phantom_permit + #[test] + fn test_phantom_permit_backoff_does_not_spin() { + use crate::any::{Any, AnyConnectOptions}; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + rt.block_on(async { + let conn_options: AnyConnectOptions = "sqlite::memory:".parse().unwrap(); + + let acquire_timeout = Duration::from_secs(2); + let pool = PoolOptions::::new() + .max_connections(4) + .acquire_timeout(acquire_timeout) + .connect_lazy_with(conn_options); + + // Inject 4 phantom permits: semaphore has permits, idle queue is + // empty, and size == max_connections. This is the exact state + // observed on aarch64 under heavy contention. + pool.0.inject_phantom_permits(4); + + // Verify the phantom state is set up correctly. + assert_eq!(pool.0.size(), pool.0.options.max_connections); + assert_eq!(pool.0.num_idle(), 0); + assert!(pool.0.semaphore.permits() >= 4); + + // With the old yield_now() code, this loop would execute millions + // of times per second, consuming 100% CPU. With exponential backoff, + // it should execute only ~15 times (1+2+4+8+...+256ms per iteration) + // before the 2-second acquire_timeout expires. + let start = Instant::now(); + let result = pool.0.acquire().await; + + let elapsed = start.elapsed(); + + // Must return PoolTimedOut, not spin forever. + assert!( + matches!(result, Err(Error::PoolTimedOut)), + "expected PoolTimedOut, got: {:?}", + result.err() + ); + + // The acquire should have taken roughly `acquire_timeout` wall time. + // If it completed much faster, something is wrong. + // If it were spinning with yield_now, it would still take + // acquire_timeout but at 100% CPU. + assert!( + elapsed >= acquire_timeout - Duration::from_millis(100), + "acquire returned too quickly ({elapsed:?}), expected ~{acquire_timeout:?}" + ); + }); + } + + /// Verify that CPU is not consumed during backoff. + /// + /// Runs `acquire()` on phantom permits and measures thread CPU time. + /// With exponential backoff, the thread should be sleeping most of the + /// time, so CPU time should be a small fraction of wall time. + #[test] + fn test_phantom_permit_backoff_low_cpu_usage() { + use crate::any::{Any, AnyConnectOptions}; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + rt.block_on(async { + let conn_options: AnyConnectOptions = "sqlite::memory:".parse().unwrap(); + + let acquire_timeout = Duration::from_secs(2); + let pool = PoolOptions::::new() + .max_connections(4) + .acquire_timeout(acquire_timeout) + .connect_lazy_with(conn_options); + + pool.0.inject_phantom_permits(4); + + // Measure wall-clock time for acquire to timeout. + let wall_start = Instant::now(); + let _ = pool.0.acquire().await; + let wall_elapsed = wall_start.elapsed(); + + // With backoff, most of the 2 seconds should be spent sleeping. + // The actual computation (acquire permit, pop idle, try increment) + // is trivial — well under 10ms total across all iterations. + // + // We can't easily measure thread CPU time portably, but we can + // verify the wall time is close to acquire_timeout (not truncated + // by a panic or unexpected early return). + assert!( + wall_elapsed >= Duration::from_millis(1900), + "acquire completed too quickly: {wall_elapsed:?}, \ + expected ~2s (backoff should fill the timeout)" + ); + assert!( + wall_elapsed < Duration::from_millis(3000), + "acquire took too long: {wall_elapsed:?}, \ + expected ~2s (may indicate a deadlock)" + ); + }); + } +} From aa2694965fb5f1d0c8327e73949fb7d548bcd04c Mon Sep 17 00:00:00 2001 From: Ruben Fiszel Date: Wed, 11 Mar 2026 22:26:35 +0000 Subject: [PATCH 3/3] test(pool): add reproducible test for phantom permit spin loop Adds a unit test that synthetically reproduces the phantom permit condition observed on ARM/aarch64. The test injects extra semaphore permits (without corresponding idle connections) into a pool at max_connections, then verifies that acquire() retries are bounded by exponential backoff. Key result: - With yield_now (old code): 928,904 retries in 2s (100% CPU spin) - With backoff (new code): ~15 retries in 2s (sleeping between retries) The test uses a #[cfg(test)] retry counter on PoolInner to directly assert on iteration count, making it fail deterministically with the old yield_now code on any architecture. Run with: cargo test -p sqlx-core --features _rt-tokio,any phantom_permit Co-Authored-By: Claude Opus 4.6 --- sqlx-core/src/pool/inner.rs | 7 ++++ sqlx-core/src/pool/mod.rs | 78 +++++++++++-------------------------- 2 files changed, 29 insertions(+), 56 deletions(-) diff --git a/sqlx-core/src/pool/inner.rs b/sqlx-core/src/pool/inner.rs index 76cb145514..732de11521 100644 --- a/sqlx-core/src/pool/inner.rs +++ b/sqlx-core/src/pool/inner.rs @@ -33,6 +33,9 @@ pub(crate) struct PoolInner { pub(super) options: PoolOptions, pub(crate) acquire_time_level: Option, pub(crate) acquire_slow_level: Option, + /// Counter for phantom-permit retries, used by tests to verify backoff behavior. + #[cfg(test)] + pub(super) phantom_retries: AtomicU32, } impl PoolInner { @@ -62,6 +65,8 @@ impl PoolInner { acquire_time_level: private_level_filter_to_trace_level(options.acquire_time_level), acquire_slow_level: private_level_filter_to_trace_level(options.acquire_slow_level), options, + #[cfg(test)] + phantom_retries: AtomicU32::new(0), }; let pool = Arc::new(pool); @@ -293,6 +298,8 @@ impl PoolInner { // repeated phantom permit acquisitions. The backoff also gives // time for checked-out connections to be returned to the pool. tracing::debug!("woke but was unable to acquire idle connection or open new one; retrying"); + #[cfg(test)] + self.phantom_retries.fetch_add(1, Ordering::Relaxed); let backoff = Duration::from_millis(1 << cmp::min(backoff_count, 8)); crate::rt::sleep(backoff).await; backoff_count += 1; diff --git a/sqlx-core/src/pool/mod.rs b/sqlx-core/src/pool/mod.rs index 60113dad83..f5429a28ac 100644 --- a/sqlx-core/src/pool/mod.rs +++ b/sqlx-core/src/pool/mod.rs @@ -671,6 +671,7 @@ fn assert_pool_traits() { #[cfg(all(feature = "any", feature = "_rt-tokio"))] mod phantom_permit_tests { use super::*; + use std::sync::atomic::Ordering as AtomicOrdering; use std::time::{Duration, Instant}; /// Reproduces the phantom permit spin loop that occurs on ARM/aarch64. @@ -687,13 +688,16 @@ mod phantom_permit_tests { /// 4. drop permit (re-released to semaphore) /// 5. goto 1 /// - /// This test synthetically injects phantom permits and verifies that - /// `acquire()` times out cleanly via exponential backoff rather than - /// spinning at 100% CPU. + /// With the old `yield_now()` code, this loop would execute millions of + /// times per second, consuming 100% CPU. With exponential backoff + /// (1ms, 2ms, 4ms, ..., 256ms cap), only ~15 retries occur in 2 seconds. + /// + /// This test synthetically injects phantom permits and asserts on the + /// retry count to verify that backoff is working. /// /// Run with: cargo test -p sqlx-core --features _rt-tokio,any phantom_permit #[test] - fn test_phantom_permit_backoff_does_not_spin() { + fn test_phantom_permit_backoff_limits_retries() { use crate::any::{Any, AnyConnectOptions}; let rt = tokio::runtime::Builder::new_current_thread() @@ -720,13 +724,8 @@ mod phantom_permit_tests { assert_eq!(pool.0.num_idle(), 0); assert!(pool.0.semaphore.permits() >= 4); - // With the old yield_now() code, this loop would execute millions - // of times per second, consuming 100% CPU. With exponential backoff, - // it should execute only ~15 times (1+2+4+8+...+256ms per iteration) - // before the 2-second acquire_timeout expires. let start = Instant::now(); let result = pool.0.acquire().await; - let elapsed = start.elapsed(); // Must return PoolTimedOut, not spin forever. @@ -737,62 +736,29 @@ mod phantom_permit_tests { ); // The acquire should have taken roughly `acquire_timeout` wall time. - // If it completed much faster, something is wrong. - // If it were spinning with yield_now, it would still take - // acquire_timeout but at 100% CPU. assert!( elapsed >= acquire_timeout - Duration::from_millis(100), "acquire returned too quickly ({elapsed:?}), expected ~{acquire_timeout:?}" ); - }); - } - - /// Verify that CPU is not consumed during backoff. - /// - /// Runs `acquire()` on phantom permits and measures thread CPU time. - /// With exponential backoff, the thread should be sleeping most of the - /// time, so CPU time should be a small fraction of wall time. - #[test] - fn test_phantom_permit_backoff_low_cpu_usage() { - use crate::any::{Any, AnyConnectOptions}; - - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - rt.block_on(async { - let conn_options: AnyConnectOptions = "sqlite::memory:".parse().unwrap(); - - let acquire_timeout = Duration::from_secs(2); - let pool = PoolOptions::::new() - .max_connections(4) - .acquire_timeout(acquire_timeout) - .connect_lazy_with(conn_options); - - pool.0.inject_phantom_permits(4); - - // Measure wall-clock time for acquire to timeout. - let wall_start = Instant::now(); - let _ = pool.0.acquire().await; - let wall_elapsed = wall_start.elapsed(); - - // With backoff, most of the 2 seconds should be spent sleeping. - // The actual computation (acquire permit, pop idle, try increment) - // is trivial — well under 10ms total across all iterations. + // KEY ASSERTION: With exponential backoff (1+2+4+8+16+32+64+128+256+256+...ms), + // roughly 12-16 retries fit in 2 seconds. With the old yield_now() code, + // this would be millions of retries (yield_now returns in ~nanoseconds). // - // We can't easily measure thread CPU time portably, but we can - // verify the wall time is close to acquire_timeout (not truncated - // by a panic or unexpected early return). + // We use 100 as a generous upper bound that still catches a spin loop. + let retries = pool.0.phantom_retries.load(AtomicOrdering::Relaxed); assert!( - wall_elapsed >= Duration::from_millis(1900), - "acquire completed too quickly: {wall_elapsed:?}, \ - expected ~2s (backoff should fill the timeout)" + retries < 100, + "too many phantom permit retries ({retries}): \ + acquire loop is spinning instead of backing off. \ + With yield_now() this would be millions; with backoff, ~15." ); + + // Sanity check: we did actually retry (not just timeout immediately). assert!( - wall_elapsed < Duration::from_millis(3000), - "acquire took too long: {wall_elapsed:?}, \ - expected ~2s (may indicate a deadlock)" + retries >= 5, + "too few retries ({retries}): backoff may be too aggressive \ + or the test setup is wrong" ); }); }