Skip to content
Closed
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion crates/backend/system-info/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ edition.workspace = true

[dependencies]
libc = "0.2"
rayon.workspace = true

[lints]
workspace = true
33 changes: 0 additions & 33 deletions crates/backend/system-info/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,3 @@ pub fn peak_rss_bytes() -> u64 {
// ru_maxrss unit: bytes on macOS, KiB on Linux.
if cfg!(target_os = "macos") { max } else { max * 1024 }
}

/// How many no-op jobs [`flush_rayon`] submits.
///
/// Has to be larger than `crossbeam_deque::deque::BLOCK_CAP` (currently 63 —
/// `crossbeam-deque-0.8.6/src/deque.rs:1191`) so that at least one block
/// boundary is crossed.
const RAYON_FLUSH_JOBS: usize = 256;

/// Drains rayon's internal queues so that storage allocated during the previous
/// phase is released.
///
/// # Background
///
/// The global rayon pool owns a `crossbeam_deque::Injector`, which is a linked
/// list of fixed-size blocks (`Block` and `Injector::push` —
/// `crossbeam-deque-0.8.6/src/deque.rs:1219` and `:1371`). A block is only freed
/// after its last slot has been consumed.
///
/// A `rayon::join` issued from a non-worker thread ends up in that injector:
/// `join` (`rayon-core-1.13.0/src/join/mod.rs:132`) ->
/// `registry::in_worker` (`registry.rs:946`) ->
/// `Registry::in_worker_cold` (`:517`) ->
/// `Registry::inject` (`:428`) -> `Injector::push`.
///
/// # Why it matters
///
/// With an arena allocator that recycles memory between phases (e.g. `zk-alloc`),
/// a block allocated *during* a phase lives in a slab that the next
/// `begin_phase()` reuses. The following push would then write a `JobRef`
/// straight through whatever the application placed there, silently corrupting it.
///
/// # How the flush works
///
/// Submitting more than `BLOCK_CAP` jobs while the arena is off makes the
/// Injector allocate a fresh tail block (which lands in System) and makes the
/// workers steal the last slot of every earlier block (which destroys them).
pub fn flush_rayon() {
    (0..RAYON_FLUSH_JOBS).for_each(|_| {
        rayon::join(|| {}, || {});
    });
}
139 changes: 129 additions & 10 deletions crates/backend/zk-alloc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,29 @@ const SLACK: usize = 4; // SLACK absorbs the main thread and any non-rayon helpe
const MAX_THREADS: usize = NUM_THREADS + SLACK;
const REGION_SIZE: usize = SLAB_SIZE * MAX_THREADS;

/// Allocations smaller than this go to System even during active phases.
///
/// Routes registry / hashmap / injector-block-sized allocations away from the
/// arena, so library state that outlives a phase doesn't land in recycled
/// memory. Covers the known phase-crossing patterns: crossbeam_deque::Injector
/// blocks (~1.5 KB), tracing-subscriber Registry slot data (sub-KB), hashbrown
/// HashMap entries (sub-KB), rayon-core job stack frames (sub-KB).
///
/// TODO is there a cleaner way?
///
/// # aarch64 value
///
/// On aarch64 the threshold is lower than the 4096 used elsewhere: since the
/// THP-backed arena landed, arena allocations hit a 32 MiB hugepage TLB entry
/// whereas System allocations land on 16 KiB base pages, so moving more of the
/// size band into the arena buys the hugepage TLB benefit for more allocations.
///
/// The threshold must still stay above the largest known phase-crossing
/// allocation, the ~1.5 KB Injector block. An earlier aarch64 value of 256 did
/// not: it put those blocks, and any phase-crossing allocation in [256, 1500),
/// into the arena where they get recycled. 2048 keeps every pattern listed
/// above in System while still sending the [2048, 4096) band to the arena.
/// Sticky-System realloc additionally protects grown Vecs that started in
/// System.
///
/// NOTE(review): 2048 was chosen for safety, not measured — re-run the M2
/// benchmark to confirm the remaining hugepage benefit.
#[cfg(target_arch = "aarch64")]
const MIN_ARENA_BYTES: usize = 2048;
#[cfg(not(target_arch = "aarch64"))]
const MIN_ARENA_BYTES: usize = 4096;

/// Zero-sized allocator handle. Its `GlobalAlloc` implementation (further down in
/// this file) decides per allocation between the phase arena and
/// `std::alloc::System`.
#[derive(Debug)]
pub struct ZkAllocator;

Expand Down Expand Up @@ -76,14 +99,87 @@ thread_local! {
/// Returns the base address of the mmap'd region, mapping it on the first call.
fn ensure_region() -> usize {
REGION_INIT.call_once(|| {
// On aarch64 Linux (M2/Asahi) THP page size is 32 MiB. We over-allocate
// by THP_SIZE so we can round REGION_BASE up to a 32 MiB boundary, which
// is what khugepaged needs to collapse base pages into hugepages. Without
// this alignment MADV_HUGEPAGE is observed to fire only intermittently
// (iter 7: real signal but p=0.019 not p<0.01). With alignment + an
// eager touch (one write per 32 MiB) the kernel collapses the touched
// region into THP synchronously, making the win deterministic.
#[cfg(target_arch = "aarch64")]
const THP_SIZE: usize = 32 << 20; // 32 MiB on M2 Asahi
#[cfg(not(target_arch = "aarch64"))]
const THP_SIZE: usize = 0;

let mmap_size = REGION_SIZE + THP_SIZE;
// SAFETY: mmap_anonymous returns a page-aligned pointer or null. MAP_NORESERVE
// means no physical memory is committed until pages are touched.
let ptr = unsafe { syscall::mmap_anonymous(REGION_SIZE) };
if ptr.is_null() {
let raw = unsafe { syscall::mmap_anonymous(mmap_size) };
if raw.is_null() {
std::process::abort();
}
unsafe { syscall::madvise(ptr, REGION_SIZE, syscall::MADV_NOHUGEPAGE) };
REGION_BASE.store(ptr as usize, Ordering::Release);

#[cfg(target_arch = "aarch64")]
let aligned_base = (raw as usize).next_multiple_of(THP_SIZE);
#[cfg(not(target_arch = "aarch64"))]
let aligned_base = raw as usize;

// On aarch64, ask khugepaged to use THP for the slab region. On x86_64
// preserve the historical NOHUGEPAGE hint (2 MiB THP can fragment slab
// release; documented original choice).
#[cfg(target_arch = "aarch64")]
let advice = syscall::MADV_HUGEPAGE;
#[cfg(not(target_arch = "aarch64"))]
let advice = syscall::MADV_NOHUGEPAGE;
unsafe { syscall::madvise(aligned_base as *mut u8, REGION_SIZE, advice) };

// Eager pre-touch on aarch64: write one byte per 32 MiB hugepage across
// the first `pretouch_bytes` of every per-thread slab. Each write
// triggers a page fault that the kernel resolves into a 32 MiB THP
// given our earlier MADV_HUGEPAGE hint and the 32 MiB-aligned base.
// This makes the THP win deterministic instead of
// khugepaged-async-dependent.
//
// Adapt `pretouch_bytes` to MemTotal (was a hard-coded 1 GiB in iter 8).
// The 1 GiB const × MAX_THREADS=14 = 14 GiB pre-touch overshoots the
// 16 GiB Asahi M2 box: the eval gate's prove_loop_cand was OOM-killed
// twice with anon-rss ~14.3 GiB on 2026-05-11 (journalctl). Cap at
// MemTotal / MAX_THREADS / OVERCOMMIT_GUARD so total pre-touch stays
// under MemTotal/3, leaving room for the workload's own ~10 GiB
// touched footprint and the rest of the process.
// - 16 GiB / 14 / 3 ≈ 390 MiB per slab → ~5.4 GiB pre-touched
// - 64 GiB / 14 / 3 ≈ 1.56 GiB per slab → capped at 1 GiB ceiling
// Floor at THP_SIZE so we still pre-touch at least one hugepage per
// slab if `total_ram_bytes()` returns a degenerately small value or
// fails (returns 0 → fall back to THP_SIZE).
// Runs in REGION_INIT.call_once, well before any timed proof window.
#[cfg(target_arch = "aarch64")]
{
const PRETOUCH_HARD_CAP: usize = 1 << 30; // 1 GiB ceiling per slab
const OVERCOMMIT_GUARD: usize = 3; // total pre-touch ≤ MemTotal/3
// SAFETY: total_ram_bytes is allocation-free (sysinfo syscall into stack buffer).
let mem_total = unsafe { syscall::total_ram_bytes() };
let pretouch_bytes = if mem_total == 0 {
THP_SIZE
} else {
let budget = mem_total / MAX_THREADS / OVERCOMMIT_GUARD;
budget.clamp(THP_SIZE, PRETOUCH_HARD_CAP)
};
for slab_idx in 0..MAX_THREADS {
let slab_base = aligned_base + slab_idx * SLAB_SIZE;
let mut off = 0;
while off < pretouch_bytes {
// SAFETY: aligned_base..aligned_base+REGION_SIZE is a valid
// anonymous mmap reservation; we only touch within slab.
unsafe {
std::ptr::write_volatile((slab_base + off) as *mut u8, 0);
}
off += THP_SIZE;
}
}
}

REGION_BASE.store(aligned_base, Ordering::Release);
});
REGION_BASE.load(Ordering::Acquire)
}
Expand All @@ -99,19 +195,22 @@ pub fn init() {

/// Activates the arena and resets every thread's slab. All allocations until the next
/// `end_phase()` go to the arena; the previous phase's data is overwritten in place.
///
/// # Panics
///
/// Panics if a phase is already active: phases must not nest (a nested call would
/// recycle the slab and overwrite the outer phase's still-live allocations).
pub fn begin_phase() {
    // `swap` activates the arena and reports the previous state in a single atomic
    // step, so no separate store of `true` is needed afterwards.
    let prev_active = ARENA_ACTIVE.swap(true, Ordering::Release);
    assert!(
        !prev_active,
        "begin_phase() called while another phase is already active — phases must not nest"
    );
    // NOTE(review): the arena is marked active before the generation is bumped, so a
    // thread allocating between the two steps still observes the old generation —
    // confirm callers only start a phase while no other thread is allocating.
    GENERATION.fetch_add(1, Ordering::Release);
}

/// Deactivates the arena. New allocations go to the system allocator; existing arena
/// pointers stay valid until the next `begin_phase()` resets the slabs.
///
/// Also calls [`system_info::flush_rayon`] to release any rayon/crossbeam storage
/// still referencing this phase's arena memory.
pub fn end_phase() {
// Clear the flag first: the allocator checks `ARENA_ACTIVE` on every allocation, so
// anything the flush below allocates is served by System rather than the arena.
ARENA_ACTIVE.store(false, Ordering::Release);
// Statement order matters here; do not move the flush above the store.
system_info::flush_rayon();
}

#[cold]
Expand Down Expand Up @@ -152,6 +251,15 @@ unsafe impl GlobalAlloc for ZkAllocator {
#[inline(always)]
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
if ARENA_ACTIVE.load(Ordering::Relaxed) {
// Small allocs bypass arena: registry slots / HashMap entries /
// injector-block-sized allocations from rayon/tracing libraries
// commonly outlive a phase. Routing them to System keeps them
// safe across begin_phase()/end_phase() boundaries.
//
// TODO is there a cleaner way?
if layout.size() < MIN_ARENA_BYTES {
return unsafe { std::alloc::System.alloc(layout) };
}
let generation = GENERATION.load(Ordering::Relaxed);
if ARENA_GEN.get() == generation {
let align = layout.align();
Expand Down Expand Up @@ -182,11 +290,22 @@ unsafe impl GlobalAlloc for ZkAllocator {
if new_size <= layout.size() {
return ptr;
}
// Sticky-System routing: if the original allocation came from System
// (small, or pre-phase, or routed by size-routing), keep the grown
// allocation in System too. Without this, a Vec allocated outside a
// phase that grows inside one would silently migrate into the arena
// and become subject to phase recycling.
let addr = ptr as usize;
let base = REGION_BASE.load(Ordering::Relaxed);
let in_arena = base != 0 && addr >= base && addr < base + REGION_SIZE;
if !in_arena {
return unsafe { std::alloc::System.realloc(ptr, layout, new_size) };
}
// SAFETY: new_size > layout.size() > 0, align unchanged from valid layout.
let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, layout.align()) };
let new_ptr = unsafe { self.alloc(new_layout) };
if !new_ptr.is_null() {
unsafe { std::ptr::copy_nonoverlapping(ptr, new_ptr, layout.size()) };
unsafe { std::ptr::copy(ptr, new_ptr, layout.size()) };
unsafe { self.dealloc(ptr, layout) };
}
new_ptr
Expand Down
117 changes: 113 additions & 4 deletions crates/backend/zk-alloc/src/syscall.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ mod imp {
const MAP_ANONYMOUS: usize = 0x20;
const MAP_NORESERVE: usize = 0x4000;

pub const MADV_HUGEPAGE: usize = 14;
pub const MADV_NOHUGEPAGE: usize = 15;

#[inline]
Expand Down Expand Up @@ -70,17 +71,121 @@ mod imp {
}
}

#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))]
#[cfg(all(target_os = "linux", target_arch = "aarch64"))]
mod imp {
    // Raw-syscall backend for Linux on aarch64. Every entry point issues `svc 0`
    // directly, so nothing in here goes through libc or allocates.
    use std::ptr;

    // Syscall numbers, passed in x8 by the wrappers below.
    const SYS_MMAP: usize = 222;
    const SYS_MADVISE: usize = 233;
    const SYS_SYSINFO: usize = 179;

    const PROT_READ: usize = 1;
    const PROT_WRITE: usize = 2;
    const MAP_PRIVATE: usize = 0x02;
    const MAP_ANONYMOUS: usize = 0x20;
    const MAP_NORESERVE: usize = 0x4000;

    pub const MADV_HUGEPAGE: usize = 14;
    pub const MADV_NOHUGEPAGE: usize = 15;

    /// Size of the stack buffer handed to `sysinfo(2)`; larger than the kernel struct.
    const SYSINFO_BUF_LEN: usize = 128;
    /// Byte offset of `totalram` inside `struct sysinfo` on 64-bit Linux.
    const SYSINFO_TOTALRAM_OFFSET: usize = 32;
    /// Byte offset of `mem_unit` inside `struct sysinfo` on 64-bit Linux.
    const SYSINFO_MEM_UNIT_OFFSET: usize = 104;

    /// Issues a six-argument syscall and returns the raw kernel result from x0.
    ///
    /// # Safety
    ///
    /// The caller must pass a syscall number and arguments that are valid for the
    /// kernel call being made.
    #[inline]
    unsafe fn syscall6(nr: usize, a1: usize, a2: usize, a3: usize, a4: usize, a5: usize, a6: usize) -> isize {
        let ret: isize;
        // SAFETY: forwarded from the caller; the asm only uses the listed registers
        // and does not touch the stack.
        unsafe {
            std::arch::asm!(
                "svc 0",
                in("x8") nr,
                inlateout("x0") a1 as isize => ret,
                in("x1") a2,
                in("x2") a3,
                in("x3") a4,
                in("x4") a5,
                in("x5") a6,
                options(nostack),
            );
        }
        ret
    }

    /// Issues a three-argument syscall and returns the raw kernel result from x0.
    ///
    /// # Safety
    ///
    /// Same contract as [`syscall6`].
    #[inline]
    unsafe fn syscall3(nr: usize, a1: usize, a2: usize, a3: usize) -> isize {
        let ret: isize;
        // SAFETY: forwarded from the caller; see `syscall6`.
        unsafe {
            std::arch::asm!(
                "svc 0",
                in("x8") nr,
                inlateout("x0") a1 as isize => ret,
                in("x1") a2,
                in("x2") a3,
                options(nostack),
            );
        }
        ret
    }

    /// Issues a one-argument syscall and returns the raw kernel result from x0.
    ///
    /// # Safety
    ///
    /// Same contract as [`syscall6`].
    #[inline]
    unsafe fn syscall1(nr: usize, a1: usize) -> isize {
        let ret: isize;
        // SAFETY: forwarded from the caller; see `syscall6`.
        unsafe {
            std::arch::asm!(
                "svc 0",
                in("x8") nr,
                inlateout("x0") a1 as isize => ret,
                options(nostack),
            );
        }
        ret
    }

    /// Extracts `totalram * mem_unit` from a raw `struct sysinfo` image.
    ///
    /// Layout of `struct sysinfo` on 64-bit Linux:
    ///   off   0  long   uptime
    ///   off   8  ulong  loads[3]
    ///   off  32  ulong  totalram   <-- the field we want
    ///   off  40  ulong  freeram
    ///   off  48  ulong  sharedram
    ///   off  56  ulong  bufferram
    ///   off  64  ulong  totalswap
    ///   off  72  ulong  freeswap
    ///   off  80  u16    procs (+ padding up to the next 8-byte boundary)
    ///   off  88  ulong  totalhigh
    ///   off  96  ulong  freehigh
    ///   off 104  u32    mem_unit   <-- multiplier
    ///
    /// Bytes 108..112 are trailing padding. Reading `mem_unit` from there (as an
    /// earlier revision did) always yields 0 and silently drops the multiplier.
    /// A `mem_unit` of 0 is treated as 1 and the product saturates instead of
    /// overflowing.
    pub(super) fn decode_total_ram(buf: &[u8; SYSINFO_BUF_LEN]) -> usize {
        let mut totalram = [0u8; 8];
        totalram.copy_from_slice(&buf[SYSINFO_TOTALRAM_OFFSET..SYSINFO_TOTALRAM_OFFSET + 8]);
        let mut mem_unit = [0u8; 4];
        mem_unit.copy_from_slice(&buf[SYSINFO_MEM_UNIT_OFFSET..SYSINFO_MEM_UNIT_OFFSET + 4]);

        // Lossless: this module is only compiled for a 64-bit target.
        let totalram = u64::from_ne_bytes(totalram) as usize;
        let mem_unit = u32::from_ne_bytes(mem_unit) as usize;
        totalram.saturating_mul(mem_unit.max(1))
    }

    /// Returns the system's total RAM in bytes via the `sysinfo(2)` syscall, or
    /// 0 on failure. Allocation-free: writes the kernel struct into a stack
    /// buffer, no libc / no Vec / no String. Safe to call from inside
    /// `#[global_allocator]` initialisation.
    ///
    /// # Safety
    ///
    /// Declared `unsafe` because it issues a raw syscall; the buffer it passes is
    /// a local array, so there are no pointer preconditions for the caller.
    pub unsafe fn total_ram_bytes() -> usize {
        let mut buf = [0u8; SYSINFO_BUF_LEN];
        // SAFETY: `buf` is writable and larger than `struct sysinfo`.
        let ret = unsafe { syscall1(SYS_SYSINFO, buf.as_mut_ptr() as usize) };
        if ret < 0 {
            return 0;
        }
        decode_total_ram(&buf)
    }

    /// Reserves `size` bytes of private, anonymous, read/write memory with
    /// `MAP_NORESERVE`. Returns null if the kernel reports an error.
    ///
    /// # Safety
    ///
    /// Issues a raw `mmap`; the caller owns the returned mapping.
    #[inline]
    pub unsafe fn mmap_anonymous(size: usize) -> *mut u8 {
        let flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
        // SAFETY: address hint 0, fd -1 (usize::MAX) and offset 0 request a fresh
        // anonymous mapping; no existing memory is affected.
        let ret = unsafe { syscall6(SYS_MMAP, 0, size, PROT_READ | PROT_WRITE, flags, usize::MAX, 0) };
        // The kernel reports failure as a negative errno value.
        if ret < 0 { ptr::null_mut() } else { ret as *mut u8 }
    }

    /// Passes `advice` for `ptr..ptr+size` to the kernel. The result is discarded,
    /// so the call is best-effort.
    ///
    /// # Safety
    ///
    /// `ptr..ptr+size` should lie within a mapping owned by the caller.
    #[inline]
    pub unsafe fn madvise(ptr: *mut u8, size: usize, advice: usize) {
        // SAFETY: forwarded from the caller.
        unsafe { syscall3(SYS_MADVISE, ptr as usize, size, advice) };
    }
}

#[cfg(not(all(target_os = "linux", any(target_arch = "x86_64", target_arch = "aarch64"))))]
mod imp {
use std::ptr;

pub const MADV_HUGEPAGE: usize = 14;
pub const MADV_NOHUGEPAGE: usize = 15;

#[inline]
pub unsafe fn mmap_anonymous(size: usize) -> *mut u8 {
// MAP_NORESERVE is Linux-only. macOS lazily backs anonymous mappings
// with physical memory by default, so the large virtual reservation we
// make is fine without NORESERVE.
// with physical memory by default, so the large virtual reservation
// is fine without NORESERVE.
let prot = libc::PROT_READ | libc::PROT_WRITE;
let flags = libc::MAP_PRIVATE | libc::MAP_ANON;
let ret = unsafe { libc::mmap(ptr::null_mut(), size, prot, flags, -1, 0) };
Expand All @@ -97,4 +202,8 @@ mod imp {
}
}

// Public surface of this module. Each name is re-exported exactly once.
#[cfg(not(target_arch = "aarch64"))]
pub use imp::MADV_NOHUGEPAGE;
#[cfg(target_arch = "aarch64")]
pub use imp::MADV_HUGEPAGE;
// Only the Linux/aarch64 backend implements `total_ram_bytes`.
#[cfg(all(target_os = "linux", target_arch = "aarch64"))]
pub use imp::total_ram_bytes;
pub use imp::{madvise, mmap_anonymous};

/// Fallback for aarch64 targets other than Linux (e.g. macOS on Apple Silicon),
/// where the libc-based backend has no `sysinfo(2)` equivalent.
///
/// Always returns 0, which is the "unknown / failure" value of the Linux
/// implementation, so the same caller code builds on every aarch64 target.
///
/// # Safety
///
/// No preconditions; `unsafe` only mirrors the signature of the Linux/aarch64
/// implementation so call sites are identical.
#[cfg(all(target_arch = "aarch64", not(target_os = "linux")))]
pub unsafe fn total_ram_bytes() -> usize {
    0
}
4 changes: 3 additions & 1 deletion crates/backend/zk-alloc/tests/test_rayon.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
//! Regression test for the bug prevented by `system_info::flush_rayon`.
//! Regression test for arena/rayon corruption: rayon's `crossbeam_deque::Injector`
//! blocks (~1.5 KB) used to land in the arena and outlive a phase. Now prevented
//! by `MIN_ARENA_BYTES` size-routing in `ZkAllocator::alloc`.

use rayon::prelude::*;

Expand Down
Loading