leanEthereum · Barnadrot · May 6, 2026 · May 9, 2026 · May 10, 2026 · May 11, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/backend/system-info/Cargo.toml b/crates/backend/system-info/Cargo.toml
@@ -5,7 +5,6 @@ edition.workspace = true
 
 [dependencies]
 libc = "0.2"
-rayon.workspace = true
 
 [lints]
 workspace = true
diff --git a/crates/backend/system-info/src/lib.rs b/crates/backend/system-info/src/lib.rs
@@ -9,36 +9,3 @@ pub fn peak_rss_bytes() -> u64 {
     // ru_maxrss unit: bytes on macOS, KiB on Linux.
     if cfg!(target_os = "macos") { max } else { max * 1024 }
 }
-
-/// Number of jobs [`flush_rayon`] pushes. Must exceed
-/// `crossbeam_deque::deque::BLOCK_CAP` (currently 63 —
-/// `crossbeam-deque-0.8.6/src/deque.rs:1191`).
-const RAYON_FLUSH_JOBS: usize = 256;
-
-/// Drain rayon's internal queues so they release any storage allocated during the
-/// previous phase.
-///
-/// Rayon's global pool owns a `crossbeam_deque::Injector`, internally a linked list
-/// of fixed-size blocks (`Block` and `Injector::push` —
-/// `crossbeam-deque-0.8.6/src/deque.rs:1219` and `:1371`). A block is freed only
-/// once its last slot has been consumed.
-///
-/// `rayon::join` from a non-worker thread reaches that injector via
-/// `join` (`rayon-core-1.13.0/src/join/mod.rs:132`) ->
-/// `registry::in_worker` (`registry.rs:946`) ->
-/// `Registry::in_worker_cold` (`:517`) ->
-/// `Registry::inject` (`:428`) -> `Injector::push`.
-///
-/// Under an arena allocator that recycles memory between phases (e.g. `zk-alloc`),
-/// a block allocated *during* a phase points into a slab the next `begin_phase()`
-/// will reuse. The next push then writes a `JobRef` straight through whatever the
-/// application has placed on top, silently corrupting it.
-///
-/// Pushing more than `BLOCK_CAP` jobs while the arena is off forces the Injector                                        
-/// to allocate a fresh tail block (which lands in System), and forces workers to                                      
-/// steal the last slot of every preceding block (which destroys them).
-pub fn flush_rayon() {
-    for _ in 0..RAYON_FLUSH_JOBS {
-        rayon::join(|| {}, || {});
-    }
-}
diff --git a/crates/backend/zk-alloc/src/lib.rs b/crates/backend/zk-alloc/src/lib.rs
@@ -30,6 +30,29 @@ const SLACK: usize = 4; // SLACK absorbs the main thread and any non-rayon helpe
 const MAX_THREADS: usize = NUM_THREADS + SLACK;
 const REGION_SIZE: usize = SLAB_SIZE * MAX_THREADS;
 
+/// Allocations smaller than this go to System even during active phases.
+/// Routes registry / hashmap / injector-block-sized allocations away from the
+/// arena, so library state that outlives a phase doesn't land in recycled
+/// memory. Covers the known phase-crossing patterns: crossbeam_deque::Injector
+/// blocks (~1.5 KB), tracing-subscriber Registry slot data (sub-KB), hashbrown
+/// HashMap entries (sub-KB), rayon-core job stack frames (sub-KB).
+///
+/// TODO is there a cleaner way?
+///
+/// Lowered from 4096 to 256 on M2 once THP-backed arena landed (iter 8): allocs
+/// in the arena now hit a 32 MiB hugepage TLB entry whereas System allocs land
+/// on 16 KiB base pages. Pushing the 256..4095 size band into the arena buys
+/// the hugepage TLB benefit for more allocations. Phase-crossing safety: at
+/// 256 the ~1.5 KB Injector blocks named above NO LONGER bypass the arena on
+/// aarch64 (1.5 KB > 256); only allocations under 256 bytes still do. Risk: any
+/// phase-crossing allocation in [256, 4096) is now in the arena and gets
+/// recycled. Sticky-System realloc still protects grown Vecs that started in
+/// System. Correctness gate enforces.
+#[cfg(target_arch = "aarch64")]
+const MIN_ARENA_BYTES: usize = 256;
+#[cfg(not(target_arch = "aarch64"))]
+const MIN_ARENA_BYTES: usize = 4096;
+
 #[derive(Debug)]
 pub struct ZkAllocator;
 
@@ -76,14 +99,87 @@ thread_local! {
 /// Returns the base address of the mmap'd region, mapping it on the first call.
 fn ensure_region() -> usize {
     REGION_INIT.call_once(|| {
+        // On aarch64 Linux (M2/Asahi) THP page size is 32 MiB. We over-allocate
+        // by THP_SIZE so we can round REGION_BASE up to a 32 MiB boundary, which
+        // is what khugepaged needs to collapse base pages into hugepages. Without
+        // this alignment MADV_HUGEPAGE is observed to fire only intermittently
+        // (iter 7: real signal but p=0.019 not p<0.01). With alignment + an
+        // eager touch (one write per 32 MiB) the kernel collapses the touched
+        // region into THP synchronously, making the win deterministic.
+        #[cfg(target_arch = "aarch64")]
+        const THP_SIZE: usize = 32 << 20; // 32 MiB on M2 Asahi
+        #[cfg(not(target_arch = "aarch64"))]
+        const THP_SIZE: usize = 0;
+
+        let mmap_size = REGION_SIZE + THP_SIZE;
         // SAFETY: mmap_anonymous returns a page-aligned pointer or null. MAP_NORESERVE
         // means no physical memory is committed until pages are touched.
-        let ptr = unsafe { syscall::mmap_anonymous(REGION_SIZE) };
-        if ptr.is_null() {
+        let raw = unsafe { syscall::mmap_anonymous(mmap_size) };
+        if raw.is_null() {
             std::process::abort();
         }
-        unsafe { syscall::madvise(ptr, REGION_SIZE, syscall::MADV_NOHUGEPAGE) };
-        REGION_BASE.store(ptr as usize, Ordering::Release);
+
+        #[cfg(target_arch = "aarch64")]
+        let aligned_base = (raw as usize).next_multiple_of(THP_SIZE);
+        #[cfg(not(target_arch = "aarch64"))]
+        let aligned_base = raw as usize;
+
+        // On aarch64, ask khugepaged to use THP for the slab region. On x86_64
+        // preserve the historical NOHUGEPAGE hint (2 MiB THP can fragment slab
+        // release; documented original choice).
+        #[cfg(target_arch = "aarch64")]
+        let advice = syscall::MADV_HUGEPAGE;
+        #[cfg(not(target_arch = "aarch64"))]
+        let advice = syscall::MADV_NOHUGEPAGE;
+        unsafe { syscall::madvise(aligned_base as *mut u8, REGION_SIZE, advice) };
+
+        // Eager pre-touch on aarch64: write one byte per 32 MiB hugepage across
+        // the first `pretouch_bytes` of every per-thread slab. Each write
+        // triggers a page fault that the kernel resolves into a 32 MiB THP
+        // given our earlier MADV_HUGEPAGE hint and the 32 MiB-aligned base.
+        // This makes the THP win deterministic instead of
+        // khugepaged-async-dependent.
+        //
+        // Adapt `pretouch_bytes` to MemTotal (was a hard-coded 1 GiB in iter 8).
+        // The 1 GiB const × MAX_THREADS=14 = 14 GiB pre-touch overshoots the
+        // 16 GiB Asahi M2 box: the eval gate's prove_loop_cand was OOM-killed
+        // twice with anon-rss ~14.3 GiB on 2026-05-11 (journalctl). Cap at
+        // MemTotal / MAX_THREADS / OVERCOMMIT_GUARD so total pre-touch stays
+        // under MemTotal/3, leaving room for the workload's own ~10 GiB
+        // touched footprint and the rest of the process.
+        // - 16 GiB / 14 / 3 ≈ 390 MiB per slab → ~5.3 GiB pre-touched
+        // - 64 GiB / 14 / 3 ≈ 1.52 GiB per slab → capped at 1 GiB ceiling
+        // Floor at THP_SIZE so we still pre-touch at least one hugepage per
+        // slab if `total_ram_bytes()` returns a degenerately small value or
+        // fails (returns 0 → fall back to THP_SIZE).
+        // Runs in REGION_INIT.call_once, well before any timed proof window.
+        #[cfg(target_arch = "aarch64")]
+        {
+            const PRETOUCH_HARD_CAP: usize = 1 << 30; // 1 GiB ceiling per slab
+            const OVERCOMMIT_GUARD: usize = 3; // total pre-touch ≤ MemTotal/3
+            // SAFETY: total_ram_bytes is allocation-free (sysinfo syscall into stack buffer).
+            let mem_total = unsafe { syscall::total_ram_bytes() };
+            let pretouch_bytes = if mem_total == 0 {
+                THP_SIZE
+            } else {
+                let budget = mem_total / MAX_THREADS / OVERCOMMIT_GUARD;
+                budget.clamp(THP_SIZE, PRETOUCH_HARD_CAP)
+            };
+            for slab_idx in 0..MAX_THREADS {
+                let slab_base = aligned_base + slab_idx * SLAB_SIZE;
+                let mut off = 0;
+                while off < pretouch_bytes {
+                    // SAFETY: aligned_base..aligned_base+REGION_SIZE is a valid
+                    // anonymous mmap reservation; we only touch within slab.
+                    unsafe {
+                        std::ptr::write_volatile((slab_base + off) as *mut u8, 0);
+                    }
+                    off += THP_SIZE;
+                }
+            }
+        }
+
+        REGION_BASE.store(aligned_base, Ordering::Release);
     });
     REGION_BASE.load(Ordering::Acquire)
 }
@@ -99,19 +195,22 @@ pub fn init() {
 
 /// Activates the arena and resets every thread's slab. All allocations until the next
 /// `end_phase()` go to the arena; the previous phase's data is overwritten in place.
+///
+/// Panics if a phase is already active: phases must not nest (a nested call would
+/// recycle the slab and overwrite the outer phase's still-live allocations).
 pub fn begin_phase() {
+    let prev_active = ARENA_ACTIVE.swap(true, Ordering::Release);
+    assert!(
+        !prev_active,
+        "begin_phase() called while another phase is already active — phases must not nest"
+    );
     GENERATION.fetch_add(1, Ordering::Release);
-    ARENA_ACTIVE.store(true, Ordering::Release);
 }
 
 /// Deactivates the arena. New allocations go to the system allocator; existing arena
 /// pointers stay valid until the next `begin_phase()` resets the slabs.
-///
-/// Also calls [`system_info::flush_rayon`] to release any rayon/crossbeam storage
-/// still referencing this phase's arena memory.
 pub fn end_phase() {
     ARENA_ACTIVE.store(false, Ordering::Release);
-    system_info::flush_rayon();
 }
 
 #[cold]
@@ -152,6 +251,15 @@ unsafe impl GlobalAlloc for ZkAllocator {
     #[inline(always)]
     unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
         if ARENA_ACTIVE.load(Ordering::Relaxed) {
+            // Small allocs bypass arena: registry slots / HashMap entries /
+            // injector-block-sized allocations from rayon/tracing libraries
+            // commonly outlive a phase. Routing them to System keeps them
+            // safe across begin_phase()/end_phase() boundaries.
+            //
+            // TODO is there a cleaner way?
+            if layout.size() < MIN_ARENA_BYTES {
+                return unsafe { std::alloc::System.alloc(layout) };
+            }
             let generation = GENERATION.load(Ordering::Relaxed);
             if ARENA_GEN.get() == generation {
                 let align = layout.align();
@@ -182,11 +290,22 @@ unsafe impl GlobalAlloc for ZkAllocator {
         if new_size <= layout.size() {
             return ptr;
         }
+        // Sticky-System routing: if the original allocation came from System
+        // (small, or pre-phase, or routed by size-routing), keep the grown
+        // allocation in System too. Without this, a Vec allocated outside a
+        // phase that grows inside one would silently migrate into the arena
+        // and become subject to phase recycling.
+        let addr = ptr as usize;
+        let base = REGION_BASE.load(Ordering::Relaxed);
+        let in_arena = base != 0 && addr >= base && addr < base + REGION_SIZE;
+        if !in_arena {
+            return unsafe { std::alloc::System.realloc(ptr, layout, new_size) };
+        }
         // SAFETY: new_size > layout.size() > 0, align unchanged from valid layout.
         let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, layout.align()) };
         let new_ptr = unsafe { self.alloc(new_layout) };
         if !new_ptr.is_null() {
-            unsafe { std::ptr::copy_nonoverlapping(ptr, new_ptr, layout.size()) };
+            unsafe { std::ptr::copy(ptr, new_ptr, layout.size()) };
             unsafe { self.dealloc(ptr, layout) };
         }
         new_ptr

diff --git a/crates/backend/zk-alloc/src/syscall.rs b/crates/backend/zk-alloc/src/syscall.rs
@@ -15,6 +15,7 @@ mod imp {
     const MAP_ANONYMOUS: usize = 0x20;
     const MAP_NORESERVE: usize = 0x4000;
 
+    pub const MADV_HUGEPAGE: usize = 14;
     pub const MADV_NOHUGEPAGE: usize = 15;
 
     #[inline]
@@ -70,17 +71,121 @@ mod imp {
     }
 }
 
-#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))]
+#[cfg(all(target_os = "linux", target_arch = "aarch64"))]
 mod imp {
     use std::ptr;
 
+    const SYS_MMAP: usize = 222;
+    const SYS_MADVISE: usize = 233;
+    const SYS_SYSINFO: usize = 179;
+
+    const PROT_READ: usize = 1;
+    const PROT_WRITE: usize = 2;
+    const MAP_PRIVATE: usize = 0x02;
+    const MAP_ANONYMOUS: usize = 0x20;
+    const MAP_NORESERVE: usize = 0x4000;
+
+    pub const MADV_HUGEPAGE: usize = 14;
+    pub const MADV_NOHUGEPAGE: usize = 15;
+
+    #[inline]
+    unsafe fn syscall6(nr: usize, a1: usize, a2: usize, a3: usize, a4: usize, a5: usize, a6: usize) -> isize {
+        let ret: isize;
+        unsafe {
+            std::arch::asm!(
+                "svc 0",
+                in("x8") nr,
+                inlateout("x0") a1 as isize => ret,
+                in("x1") a2,
+                in("x2") a3,
+                in("x3") a4,
+                in("x4") a5,
+                in("x5") a6,
+                options(nostack),
+            );
+        }
+        ret
+    }
+
+    #[inline]
+    unsafe fn syscall3(nr: usize, a1: usize, a2: usize, a3: usize) -> isize {
+        let ret: isize;
+        unsafe {
+            std::arch::asm!(
+                "svc 0",
+                in("x8") nr,
+                inlateout("x0") a1 as isize => ret,
+                in("x1") a2,
+                in("x2") a3,
+                options(nostack),
+            );
+        }
+        ret
+    }
+
+    #[inline]
+    unsafe fn syscall1(nr: usize, a1: usize) -> isize {
+        let ret: isize;
+        unsafe {
+            std::arch::asm!(
+                "svc 0",
+                in("x8") nr,
+                inlateout("x0") a1 as isize => ret,
+                options(nostack),
+            );
+        }
+        ret
+    }
+
+    /// Returns the system's total RAM in bytes via the `sysinfo(2)` syscall, or
+    /// 0 on failure. Allocation-free: writes the kernel struct into a stack
+    /// buffer, no libc / no Vec / no String. Safe to call from inside
+    /// `#[global_allocator]` initialisation.
+    ///
+    /// Layout of `struct sysinfo` on 64-bit Linux (kernel/asm-generic):
+    ///   off  0  long uptime
+    ///   off  8  ulong loads[3]
+    ///   off 32  ulong totalram        <-- the field we want
+    ///   off 40  ulong freeram
+    ///   ...
+    ///   off104  u32   mem_unit        <-- multiplier (always 1 on 64-bit)
+    pub unsafe fn total_ram_bytes() -> usize {
+        let mut buf = [0u8; 128];
+        let ret = unsafe { syscall1(SYS_SYSINFO, buf.as_mut_ptr() as usize) };
+        if ret < 0 {
+            return 0;
+        }
+        let totalram =
+            u64::from_ne_bytes([buf[32], buf[33], buf[34], buf[35], buf[36], buf[37], buf[38], buf[39]]) as usize;
+        let mem_unit = u32::from_ne_bytes([buf[104], buf[105], buf[106], buf[107]]) as usize;
+        totalram.saturating_mul(mem_unit.max(1))
+    }
+
+    #[inline]
+    pub unsafe fn mmap_anonymous(size: usize) -> *mut u8 {
+        let flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+        let ret = unsafe { syscall6(SYS_MMAP, 0, size, PROT_READ | PROT_WRITE, flags, usize::MAX, 0) };
+        if ret < 0 { ptr::null_mut() } else { ret as *mut u8 }
+    }
+
+    #[inline]
+    pub unsafe fn madvise(ptr: *mut u8, size: usize, advice: usize) {
+        unsafe { syscall3(SYS_MADVISE, ptr as usize, size, advice) };
+    }
+}
+
+#[cfg(not(all(target_os = "linux", any(target_arch = "x86_64", target_arch = "aarch64"))))]
+mod imp {
+    use std::ptr;
+
+    pub const MADV_HUGEPAGE: usize = 14;
     pub const MADV_NOHUGEPAGE: usize = 15;
 
     #[inline]
     pub unsafe fn mmap_anonymous(size: usize) -> *mut u8 {
         // MAP_NORESERVE is Linux-only. macOS lazily backs anonymous mappings
-        // with physical memory by default, so the large virtual reservation we
-        // make is fine without NORESERVE.
+        // with physical memory by default, so the large virtual reservation
+        // is fine without NORESERVE.
         let prot = libc::PROT_READ | libc::PROT_WRITE;
         let flags = libc::MAP_PRIVATE | libc::MAP_ANON;
         let ret = unsafe { libc::mmap(ptr::null_mut(), size, prot, flags, -1, 0) };
@@ -97,4 +202,18 @@ mod imp {
     }
 }
 
-pub use imp::{MADV_NOHUGEPAGE, madvise, mmap_anonymous};
+#[cfg(not(target_arch = "aarch64"))]
+pub use imp::MADV_NOHUGEPAGE;
+#[cfg(target_arch = "aarch64")]
+pub use imp::MADV_HUGEPAGE;
+#[cfg(all(target_os = "linux", target_arch = "aarch64"))]
+pub use imp::total_ram_bytes;
+pub use imp::{madvise, mmap_anonymous};
+
+/// Fallback for aarch64 targets other than Linux (e.g. Apple Silicon macOS), whose
+/// `imp` module has no `sysinfo(2)` wrapper. Returning 0 reports "unknown", which
+/// the caller treats as "use the minimum pre-touch size".
+#[cfg(all(target_arch = "aarch64", not(target_os = "linux")))]
+pub unsafe fn total_ram_bytes() -> usize {
+    0
+}
diff --git a/crates/backend/zk-alloc/tests/test_rayon.rs b/crates/backend/zk-alloc/tests/test_rayon.rs
@@ -1,4 +1,6 @@
-//! Regression test for the bug prevented by `system_info::flush_rayon`.
+//! Regression test for arena/rayon corruption: rayon's `crossbeam_deque::Injector`
+//! blocks (~1.5 KB) used to land in the arena and outlive a phase. Now prevented
+//! by `MIN_ARENA_BYTES` size-routing in `ZkAllocator::alloc`.
 
 use rayon::prelude::*;