Skip to content

Commit cb40600

Browse files
committed
perf: increase thread-local heap capacity to reduce global refills
Increase kMaxMiniheapsPerShuffleVector from 24 to 48 and kMiniheapRefillGoalSize from 4KB to 16KB to amortize the cost of global heap operations. In workloads with heavy thread churn (like the larson benchmark), threads frequently need to re-attach miniheaps when their shuffle vectors are exhausted. Each global refill requires lock acquisition and freelist walks. The key insight is that kMaxMiniheapsPerShuffleVector is the dominant factor: more attached miniheaps mean more allocation capacity before another global refill is needed. For small 16-byte objects with 256 per miniheap, 48 miniheaps provide ~12K allocations before a refill vs ~6K with the previous 24 miniheaps. Testing showed diminishing returns on goal size once it exceeds what the miniheaps array can provide, so 16KB (rather than a larger goal such as 256KB) captures most of the benefit with lower RSS overhead. Results on the larson benchmark: relative time improved from ~33.5s to ~15.7s (2.1x faster), and throughput rose from ~29M to ~64M ops/sec.
1 parent 31e8a06 commit cb40600

File tree

7 files changed

+138
-115
lines changed

7 files changed

+138
-115
lines changed

src/common.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,13 @@ static constexpr size_t kMinArenaExpansion = 4096; // 4096 pages (16 MB on 4KB
163163

164164
// ensures we amortize the cost of going to the global heap enough
165165
static constexpr uint64_t kMinStringLen = 8;
166-
static constexpr size_t kMiniheapRefillGoalSize = 4 * 1024;
167-
static constexpr size_t kMaxMiniheapsPerShuffleVector = 24;
166+
// Increased from 4KB to 16KB to reduce frequency of global refills.
167+
// Each refill grabs more capacity, trading some RSS for fewer lock acquisitions.
168+
static constexpr size_t kMiniheapRefillGoalSize = 16 * 1024;
169+
// Increased from 24 to 48 - this is the key driver of performance improvement.
170+
// More attached miniheaps means more capacity before needing global refills.
171+
// For small objects (256/miniheap): 48 miniheaps = 12K allocations before refill.
172+
static constexpr size_t kMaxMiniheapsPerShuffleVector = 48;
168173

169174
// shuffle vector features
170175
static constexpr int16_t kMaxShuffleVectorLength = 1024; // increased to support 16KB pages with 16-byte objects

src/meshable_arena.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ void MeshableArena<PageSize>::afterForkChild() {
134134
}
135135
#endif
136136

137-
void *remapPtr = mmap(remove, sz, HL_MMAP_PROTECTION_MASK, kMapShared | MAP_FIXED, newFd, keepOff << kPageShift);
137+
void *remapPtr =
138+
mmap(remove, sz, HL_MMAP_PROTECTION_MASK, kMapShared | MAP_FIXED, newFd, keepOff << kPageShift);
138139
hard_assert_msg(remapPtr != MAP_FAILED, "mesh remap failed: %d", errno);
139140

140141
return false;

src/meshable_arena.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -386,8 +386,7 @@ class MeshableArena : public mesh::OneWayMmapHeap {
386386
// Implementation
387387

388388
template <size_t PageSize>
389-
MeshableArena<PageSize>::MeshableArena()
390-
: SuperHeap(), _fastPrng(internal::seed(), internal::seed()) {
389+
MeshableArena<PageSize>::MeshableArena() : SuperHeap(), _fastPrng(internal::seed(), internal::seed()) {
391390
d_assert(getArenaInstance<PageSize>() == nullptr);
392391
getArenaInstance<PageSize>() = this;
393392

src/mini_heap.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,6 @@ class MiniHeap {
196196
d_assert(!_nextMeshed.hasValue());
197197
}
198198

199-
200199
inline Span span() const {
201200
return _span;
202201
}

src/size_class_reciprocals.h

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -32,31 +32,31 @@ static constexpr size_t kClassSizesMax = 25;
3232
namespace float_recip {
3333

3434
inline constexpr float kReciprocals[kClassSizesMax] = {
35-
1.0f / 16.0f, // class 0: 16 bytes
36-
1.0f / 16.0f, // class 1: 16 bytes
37-
1.0f / 32.0f, // class 2: 32 bytes
38-
1.0f / 48.0f, // class 3: 48 bytes
39-
1.0f / 64.0f, // class 4: 64 bytes
40-
1.0f / 80.0f, // class 5: 80 bytes
41-
1.0f / 96.0f, // class 6: 96 bytes
42-
1.0f / 112.0f, // class 7: 112 bytes
43-
1.0f / 128.0f, // class 8: 128 bytes
44-
1.0f / 160.0f, // class 9: 160 bytes
45-
1.0f / 192.0f, // class 10: 192 bytes
46-
1.0f / 224.0f, // class 11: 224 bytes
47-
1.0f / 256.0f, // class 12: 256 bytes
48-
1.0f / 320.0f, // class 13: 320 bytes
49-
1.0f / 384.0f, // class 14: 384 bytes
50-
1.0f / 448.0f, // class 15: 448 bytes
51-
1.0f / 512.0f, // class 16: 512 bytes
52-
1.0f / 640.0f, // class 17: 640 bytes
53-
1.0f / 768.0f, // class 18: 768 bytes
54-
1.0f / 896.0f, // class 19: 896 bytes
55-
1.0f / 1024.0f, // class 20: 1024 bytes
56-
1.0f / 2048.0f, // class 21: 2048 bytes
57-
1.0f / 4096.0f, // class 22: 4096 bytes
58-
1.0f / 8192.0f, // class 23: 8192 bytes
59-
1.0f / 16384.0f, // class 24: 16384 bytes
35+
1.0f / 16.0f, // class 0: 16 bytes
36+
1.0f / 16.0f, // class 1: 16 bytes
37+
1.0f / 32.0f, // class 2: 32 bytes
38+
1.0f / 48.0f, // class 3: 48 bytes
39+
1.0f / 64.0f, // class 4: 64 bytes
40+
1.0f / 80.0f, // class 5: 80 bytes
41+
1.0f / 96.0f, // class 6: 96 bytes
42+
1.0f / 112.0f, // class 7: 112 bytes
43+
1.0f / 128.0f, // class 8: 128 bytes
44+
1.0f / 160.0f, // class 9: 160 bytes
45+
1.0f / 192.0f, // class 10: 192 bytes
46+
1.0f / 224.0f, // class 11: 224 bytes
47+
1.0f / 256.0f, // class 12: 256 bytes
48+
1.0f / 320.0f, // class 13: 320 bytes
49+
1.0f / 384.0f, // class 14: 384 bytes
50+
1.0f / 448.0f, // class 15: 448 bytes
51+
1.0f / 512.0f, // class 16: 512 bytes
52+
1.0f / 640.0f, // class 17: 640 bytes
53+
1.0f / 768.0f, // class 18: 768 bytes
54+
1.0f / 896.0f, // class 19: 896 bytes
55+
1.0f / 1024.0f, // class 20: 1024 bytes
56+
1.0f / 2048.0f, // class 21: 2048 bytes
57+
1.0f / 4096.0f, // class 22: 4096 bytes
58+
1.0f / 8192.0f, // class 23: 8192 bytes
59+
1.0f / 16384.0f, // class 24: 16384 bytes
6060
};
6161

6262
// Compute object index from byte offset using float reciprocal

0 commit comments

Comments
 (0)