diff --git a/Cargo.lock b/Cargo.lock index d938586b8..014747f9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1116,7 +1116,6 @@ name = "system-info" version = "0.1.0" dependencies = [ "libc", - "rayon", ] [[package]] diff --git a/crates/backend/system-info/Cargo.toml b/crates/backend/system-info/Cargo.toml index c63ee1297..862e36e89 100644 --- a/crates/backend/system-info/Cargo.toml +++ b/crates/backend/system-info/Cargo.toml @@ -5,7 +5,6 @@ edition.workspace = true [dependencies] libc = "0.2" -rayon.workspace = true [lints] workspace = true diff --git a/crates/backend/system-info/src/lib.rs b/crates/backend/system-info/src/lib.rs index 07180559b..5323c1ce4 100644 --- a/crates/backend/system-info/src/lib.rs +++ b/crates/backend/system-info/src/lib.rs @@ -9,36 +9,3 @@ pub fn peak_rss_bytes() -> u64 { // ru_maxrss unit: bytes on macOS, KiB on Linux. if cfg!(target_os = "macos") { max } else { max * 1024 } } - -/// Number of jobs [`flush_rayon`] pushes. Must exceed -/// `crossbeam_deque::deque::BLOCK_CAP` (currently 63 — -/// `crossbeam-deque-0.8.6/src/deque.rs:1191`). -const RAYON_FLUSH_JOBS: usize = 256; - -/// Drain rayon's internal queues so they release any storage allocated during the -/// previous phase. -/// -/// Rayon's global pool owns a `crossbeam_deque::Injector`, internally a linked list -/// of fixed-size blocks (`Block` and `Injector::push` — -/// `crossbeam-deque-0.8.6/src/deque.rs:1219` and `:1371`). A block is freed only -/// once its last slot has been consumed. -/// -/// `rayon::join` from a non-worker thread reaches that injector via -/// `join` (`rayon-core-1.13.0/src/join/mod.rs:132`) -> -/// `registry::in_worker` (`registry.rs:946`) -> -/// `Registry::in_worker_cold` (`:517`) -> -/// `Registry::inject` (`:428`) -> `Injector::push`. -/// -/// Under an arena allocator that recycles memory between phases (e.g. `zk-alloc`), -/// a block allocated *during* a phase points into a slab the next `begin_phase()` -/// will reuse. 
The next push then writes a `JobRef` straight through whatever the -/// application has placed on top, silently corrupting it. -/// -/// Pushing more than `BLOCK_CAP` jobs while the arena is off forces the Injector -/// to allocate a fresh tail block (which lands in System), and forces workers to -/// steal the last slot of every preceding block (which destroys them). -pub fn flush_rayon() { - for _ in 0..RAYON_FLUSH_JOBS { - rayon::join(|| {}, || {}); - } -} diff --git a/crates/backend/zk-alloc/src/lib.rs b/crates/backend/zk-alloc/src/lib.rs index cae70642f..8aab4ba21 100644 --- a/crates/backend/zk-alloc/src/lib.rs +++ b/crates/backend/zk-alloc/src/lib.rs @@ -30,6 +30,29 @@ const SLACK: usize = 4; // SLACK absorbs the main thread and any non-rayon helpe const MAX_THREADS: usize = NUM_THREADS + SLACK; const REGION_SIZE: usize = SLAB_SIZE * MAX_THREADS; +/// Allocations smaller than this go to System even during active phases. +/// Routes registry / hashmap / injector-block-sized allocations away from the +/// arena, so library state that outlives a phase doesn't land in recycled +/// memory. Covers the known phase-crossing patterns: crossbeam_deque::Injector +/// blocks (~1.5 KB), tracing-subscriber Registry slot data (sub-KB), hashbrown +/// HashMap entries (sub-KB), rayon-core job stack frames (sub-KB). +/// +/// TODO is there a cleaner way? +/// +/// Lowered from 4096 to 256 on M2 once THP-backed arena landed (iter 8): allocs +/// in the arena now hit a 32 MiB hugepage TLB entry whereas System allocs land +/// on 16 KiB base pages. Pushing the 256..4095 size band into the arena buys +/// the hugepage TLB benefit for more allocations. Phase-crossing safety: the +/// named ~1.5 KB Injector blocks do NOT bypass via System at this threshold: +/// they exceed the [0, 256) carve-out and land in the arena. Risk: any +/// phase-crossing allocation in [256, 4096) is now in the arena and gets +/// recycled. 
Sticky-System realloc still protects grown Vecs that started in +/// System. Correctness gate enforces. +#[cfg(target_arch = "aarch64")] +const MIN_ARENA_BYTES: usize = 256; +#[cfg(not(target_arch = "aarch64"))] +const MIN_ARENA_BYTES: usize = 4096; + #[derive(Debug)] pub struct ZkAllocator; @@ -76,14 +99,87 @@ thread_local! { /// Returns the base address of the mmap'd region, mapping it on the first call. fn ensure_region() -> usize { REGION_INIT.call_once(|| { + // On aarch64 Linux (M2/Asahi) THP page size is 32 MiB. We over-allocate + // by THP_SIZE so we can round REGION_BASE up to a 32 MiB boundary, which + // is what khugepaged needs to collapse base pages into hugepages. Without + // this alignment MADV_HUGEPAGE is observed to fire only intermittently + // (iter 7: real signal but p=0.019 not p<0.01). With alignment + an + // eager touch (one write per 32 MiB) the kernel collapses the touched + // region into THP synchronously, making the win deterministic. + #[cfg(target_arch = "aarch64")] + const THP_SIZE: usize = 32 << 20; // 32 MiB on M2 Asahi + #[cfg(not(target_arch = "aarch64"))] + const THP_SIZE: usize = 0; + + let mmap_size = REGION_SIZE + THP_SIZE; // SAFETY: mmap_anonymous returns a page-aligned pointer or null. MAP_NORESERVE // means no physical memory is committed until pages are touched. - let ptr = unsafe { syscall::mmap_anonymous(REGION_SIZE) }; - if ptr.is_null() { + let raw = unsafe { syscall::mmap_anonymous(mmap_size) }; + if raw.is_null() { std::process::abort(); } - unsafe { syscall::madvise(ptr, REGION_SIZE, syscall::MADV_NOHUGEPAGE) }; - REGION_BASE.store(ptr as usize, Ordering::Release); + + #[cfg(target_arch = "aarch64")] + let aligned_base = (raw as usize).next_multiple_of(THP_SIZE); + #[cfg(not(target_arch = "aarch64"))] + let aligned_base = raw as usize; + + // On aarch64, ask khugepaged to use THP for the slab region. 
On x86_64 + // preserve the historical NOHUGEPAGE hint (2 MiB THP can fragment slab + // release; documented original choice). + #[cfg(target_arch = "aarch64")] + let advice = syscall::MADV_HUGEPAGE; + #[cfg(not(target_arch = "aarch64"))] + let advice = syscall::MADV_NOHUGEPAGE; + unsafe { syscall::madvise(aligned_base as *mut u8, REGION_SIZE, advice) }; + + // Eager pre-touch on aarch64: write one byte per 32 MiB hugepage across + // the first `pretouch_bytes` of every per-thread slab. Each write + // triggers a page fault that the kernel resolves into a 32 MiB THP + // given our earlier MADV_HUGEPAGE hint and the 32 MiB-aligned base. + // This makes the THP win deterministic instead of + // khugepaged-async-dependent. + // + // Adapt `pretouch_bytes` to MemTotal (was a hard-coded 1 GiB in iter 8). + // The 1 GiB const × MAX_THREADS=14 = 14 GiB pre-touch overshoots the + // 16 GiB Asahi M2 box: the eval gate's prove_loop_cand was OOM-killed + // twice with anon-rss ~14.3 GiB on 2026-05-11 (journalctl). Cap at + // MemTotal / MAX_THREADS / OVERCOMMIT_GUARD so total pre-touch stays + // under MemTotal/3, leaving room for the workload's own ~10 GiB + // touched footprint and the rest of the process. + // - 16 GiB / 14 / 3 ≈ 390 MiB per slab → ~5.4 GiB pre-touched + // - 64 GiB / 14 / 3 ≈ 1.56 GiB per slab → capped at 1 GiB ceiling + // Floor at THP_SIZE so we still pre-touch at least one hugepage per + // slab if `total_ram_bytes()` returns a degenerately small value or + // fails (returns 0 → fall back to THP_SIZE). + // Runs in REGION_INIT.call_once, well before any timed proof window. + #[cfg(target_arch = "aarch64")] + { + const PRETOUCH_HARD_CAP: usize = 1 << 30; // 1 GiB ceiling per slab + const OVERCOMMIT_GUARD: usize = 3; // total pre-touch ≤ MemTotal/3 + // SAFETY: total_ram_bytes is allocation-free (sysinfo syscall into stack buffer). 
+ let mem_total = unsafe { syscall::total_ram_bytes() }; + let pretouch_bytes = if mem_total == 0 { + THP_SIZE + } else { + let budget = mem_total / MAX_THREADS / OVERCOMMIT_GUARD; + budget.clamp(THP_SIZE, PRETOUCH_HARD_CAP) + }; + for slab_idx in 0..MAX_THREADS { + let slab_base = aligned_base + slab_idx * SLAB_SIZE; + let mut off = 0; + while off < pretouch_bytes { + // SAFETY: aligned_base..aligned_base+REGION_SIZE is a valid + // anonymous mmap reservation; we only touch within slab. + unsafe { + std::ptr::write_volatile((slab_base + off) as *mut u8, 0); + } + off += THP_SIZE; + } + } + } + + REGION_BASE.store(aligned_base, Ordering::Release); }); REGION_BASE.load(Ordering::Acquire) } @@ -99,19 +195,22 @@ pub fn init() { /// Activates the arena and resets every thread's slab. All allocations until the next /// `end_phase()` go to the arena; the previous phase's data is overwritten in place. +/// +/// Panics if a phase is already active: phases must not nest (a nested call would +/// recycle the slab and overwrite the outer phase's still-live allocations). pub fn begin_phase() { + let prev_active = ARENA_ACTIVE.swap(true, Ordering::Release); + assert!( + !prev_active, + "begin_phase() called while another phase is already active — phases must not nest" + ); GENERATION.fetch_add(1, Ordering::Release); - ARENA_ACTIVE.store(true, Ordering::Release); } /// Deactivates the arena. New allocations go to the system allocator; existing arena /// pointers stay valid until the next `begin_phase()` resets the slabs. -/// -/// Also calls [`system_info::flush_rayon`] to release any rayon/crossbeam storage -/// still referencing this phase's arena memory. 
pub fn end_phase() { ARENA_ACTIVE.store(false, Ordering::Release); - system_info::flush_rayon(); } #[cold] @@ -152,6 +251,15 @@ unsafe impl GlobalAlloc for ZkAllocator { #[inline(always)] unsafe fn alloc(&self, layout: Layout) -> *mut u8 { if ARENA_ACTIVE.load(Ordering::Relaxed) { + // Small allocs bypass arena: registry slots / HashMap entries / + // injector-block-sized allocations from rayon/tracing libraries + // commonly outlive a phase. Routing them to System keeps them + // safe across begin_phase()/end_phase() boundaries. + // + // TODO is there a cleaner way? + if layout.size() < MIN_ARENA_BYTES { + return unsafe { std::alloc::System.alloc(layout) }; + } let generation = GENERATION.load(Ordering::Relaxed); if ARENA_GEN.get() == generation { let align = layout.align(); @@ -182,11 +290,22 @@ unsafe impl GlobalAlloc for ZkAllocator { if new_size <= layout.size() { return ptr; } + // Sticky-System routing: if the original allocation came from System + // (small, or pre-phase, or routed by size-routing), keep the grown + // allocation in System too. Without this, a Vec allocated outside a + // phase that grows inside one would silently migrate into the arena + // and become subject to phase recycling. + let addr = ptr as usize; + let base = REGION_BASE.load(Ordering::Relaxed); + let in_arena = base != 0 && addr >= base && addr < base + REGION_SIZE; + if !in_arena { + return unsafe { std::alloc::System.realloc(ptr, layout, new_size) }; + } // SAFETY: new_size > layout.size() > 0, align unchanged from valid layout. 
let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, layout.align()) }; let new_ptr = unsafe { self.alloc(new_layout) }; if !new_ptr.is_null() { - unsafe { std::ptr::copy_nonoverlapping(ptr, new_ptr, layout.size()) }; + unsafe { std::ptr::copy(ptr, new_ptr, layout.size()) }; unsafe { self.dealloc(ptr, layout) }; } new_ptr diff --git a/crates/backend/zk-alloc/src/syscall.rs b/crates/backend/zk-alloc/src/syscall.rs index f9bea4fab..7f0d7efa6 100644 --- a/crates/backend/zk-alloc/src/syscall.rs +++ b/crates/backend/zk-alloc/src/syscall.rs @@ -15,6 +15,7 @@ mod imp { const MAP_ANONYMOUS: usize = 0x20; const MAP_NORESERVE: usize = 0x4000; + pub const MADV_HUGEPAGE: usize = 14; pub const MADV_NOHUGEPAGE: usize = 15; #[inline] @@ -70,17 +71,121 @@ mod imp { } } -#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))] +#[cfg(all(target_os = "linux", target_arch = "aarch64"))] mod imp { use std::ptr; + const SYS_MMAP: usize = 222; + const SYS_MADVISE: usize = 233; + const SYS_SYSINFO: usize = 179; + + const PROT_READ: usize = 1; + const PROT_WRITE: usize = 2; + const MAP_PRIVATE: usize = 0x02; + const MAP_ANONYMOUS: usize = 0x20; + const MAP_NORESERVE: usize = 0x4000; + + pub const MADV_HUGEPAGE: usize = 14; + pub const MADV_NOHUGEPAGE: usize = 15; + + #[inline] + unsafe fn syscall6(nr: usize, a1: usize, a2: usize, a3: usize, a4: usize, a5: usize, a6: usize) -> isize { + let ret: isize; + unsafe { + std::arch::asm!( + "svc 0", + in("x8") nr, + inlateout("x0") a1 as isize => ret, + in("x1") a2, + in("x2") a3, + in("x3") a4, + in("x4") a5, + in("x5") a6, + options(nostack), + ); + } + ret + } + + #[inline] + unsafe fn syscall3(nr: usize, a1: usize, a2: usize, a3: usize) -> isize { + let ret: isize; + unsafe { + std::arch::asm!( + "svc 0", + in("x8") nr, + inlateout("x0") a1 as isize => ret, + in("x1") a2, + in("x2") a3, + options(nostack), + ); + } + ret + } + + #[inline] + unsafe fn syscall1(nr: usize, a1: usize) -> isize { + let ret: isize; + unsafe { 
+ std::arch::asm!( + "svc 0", + in("x8") nr, + inlateout("x0") a1 as isize => ret, + options(nostack), + ); + } + ret + } + + /// Returns the system's total RAM in bytes via the `sysinfo(2)` syscall, or + /// 0 on failure. Allocation-free: writes the kernel struct into a stack + /// buffer, no libc / no Vec / no String. Safe to call from inside + /// `#[global_allocator]` initialisation. + /// + /// Layout of `struct sysinfo` on 64-bit Linux (kernel/asm-generic): + /// off 0 long uptime + /// off 8 ulong loads[3] + /// off 32 ulong totalram <-- the field we want + /// off 40 ulong freeram + /// ... + /// off104 u32 mem_unit <-- multiplier (always 1 on 64-bit); NOTE(review): bytes 108..112 read below are tail padding + pub unsafe fn total_ram_bytes() -> usize { + let mut buf = [0u8; 128]; + let ret = unsafe { syscall1(SYS_SYSINFO, buf.as_mut_ptr() as usize) }; + if ret < 0 { + return 0; + } + let totalram = + u64::from_ne_bytes([buf[32], buf[33], buf[34], buf[35], buf[36], buf[37], buf[38], buf[39]]) as usize; + let mem_unit = u32::from_ne_bytes([buf[108], buf[109], buf[110], buf[111]]) as usize; + totalram.saturating_mul(mem_unit.max(1)) + } + + #[inline] + pub unsafe fn mmap_anonymous(size: usize) -> *mut u8 { + let flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + let ret = unsafe { syscall6(SYS_MMAP, 0, size, PROT_READ | PROT_WRITE, flags, usize::MAX, 0) }; + if ret < 0 { ptr::null_mut() } else { ret as *mut u8 } + } + + #[inline] + pub unsafe fn madvise(ptr: *mut u8, size: usize, advice: usize) { + unsafe { syscall3(SYS_MADVISE, ptr as usize, size, advice) }; + } +} + +#[cfg(not(all(target_os = "linux", any(target_arch = "x86_64", target_arch = "aarch64"))))] +mod imp { + use std::ptr; + + pub const MADV_HUGEPAGE: usize = 14; pub const MADV_NOHUGEPAGE: usize = 15; #[inline] pub unsafe fn mmap_anonymous(size: usize) -> *mut u8 { // MAP_NORESERVE is Linux-only. macOS lazily backs anonymous mappings - // with physical memory by default, so the large virtual reservation we - // make is fine without NORESERVE. 
+ // with physical memory by default, so the large virtual reservation + // is fine without NORESERVE. let prot = libc::PROT_READ | libc::PROT_WRITE; let flags = libc::MAP_PRIVATE | libc::MAP_ANON; let ret = unsafe { libc::mmap(ptr::null_mut(), size, prot, flags, -1, 0) }; @@ -97,4 +202,8 @@ mod imp { } } -pub use imp::{MADV_NOHUGEPAGE, madvise, mmap_anonymous}; +#[cfg(not(target_arch = "aarch64"))] +pub use imp::MADV_NOHUGEPAGE; +#[cfg(target_arch = "aarch64")] +pub use imp::{MADV_HUGEPAGE, total_ram_bytes}; +pub use imp::{madvise, mmap_anonymous}; diff --git a/crates/backend/zk-alloc/tests/test_rayon.rs b/crates/backend/zk-alloc/tests/test_rayon.rs index ae084af21..eebcfe5cf 100644 --- a/crates/backend/zk-alloc/tests/test_rayon.rs +++ b/crates/backend/zk-alloc/tests/test_rayon.rs @@ -1,4 +1,6 @@ -//! Regression test for the bug prevented by `system_info::flush_rayon`. +//! Regression test for arena/rayon corruption: rayon's `crossbeam_deque::Injector` +//! blocks (~1.5 KB) used to land in the arena and outlive a phase. Prevented by +//! `MIN_ARENA_BYTES` size-routing in `ZkAllocator::alloc` only where that threshold exceeds the block size (4096 on non-aarch64 targets; aarch64 uses 256). use rayon::prelude::*;