diff --git a/NOTICE b/NOTICE
new file mode 100644
index 000000000..1ea6ef8bc
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,48 @@
+zvec
+Copyright 2025-present the zvec project
+
+This product is licensed under the Apache License, Version 2.0 (see the LICENSE
+file). It includes third-party software components that are distributed under
+their own licenses, as listed below.
+
+================================================================================
+Third-Party Components
+================================================================================
+
+--------------------------------------------------------------------------------
+pyglass
+--------------------------------------------------------------------------------
+Project: pyglass — Graph Library for Approximate Similarity Search
+Homepage: https://github.com/zilliztech/pyglass
+License:  MIT License
+Used in:  src/core/utility/linear_pool.h
+
+The LinearPool implementation (and the accompanying Neighbor / Bitset helpers)
+in src/core/utility/linear_pool.h is adapted from pyglass, with modifications
+(a BlockHeap-compatible reset()/push_block() interface and the use of
+MemoryHelper for huge-page-backed allocation). The related BlockHeap design in
+src/core/utility/block_heap.{h,cc} is also derived from pyglass.
+
+Original license text:
+
+    MIT License
+
+    Copyright (c) 2023 zh Wang
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE.
diff --git a/src/ailego/utility/memory_helper.cc b/src/ailego/utility/memory_helper.cc
index eee78e407..168c38b2e 100644
--- a/src/ailego/utility/memory_helper.cc
+++ b/src/ailego/utility/memory_helper.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "memory_helper.h"
+#include <cassert>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
@@ -29,6 +30,7 @@
 #include <mach/mach.h>
 #include <sys/sysctl.h>
 #endif
+#include <sys/mman.h>
 #include <unistd.h>
 #endif
 
@@ -391,5 +393,100 @@ size_t MemoryHelper::HugePageSize(void) {
   return page_size;
 }
 
+size_t MemoryHelper::AlignHugePageSize(size_t size) {
+  const size_t page_mask = HugePageSize() - 1;
+  return (size + page_mask) & (~page_mask);
+}
+
+void *MemoryHelper::AllocateHugePage(size_t size, bool zero_fill) {
+  if (size == 0) {
+    return nullptr;
+  }
+  const size_t aligned_size = AlignHugePageSize(size);
+
+#if defined(_WIN64) || defined(_WIN32)
+  void *ptr = ::_aligned_malloc(aligned_size, PageSize());
+  if (ptr == nullptr) {
+    return nullptr;
+  }
+  if (zero_fill) {
+    std::memset(ptr, 0, aligned_size);
+  }
+  return ptr;
+#else
+  void *ptr = ::mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (ptr == MAP_FAILED) {
+    return nullptr;
+  }
+  // MADV_HUGEPAGE is a Linux-only hint for transparent huge pages. On
+  // macOS/BSD (which manage superpages differently) it is intentionally
+  // absent; skipping it only forgoes a performance hint, not correctness.
+#if defined(MADV_HUGEPAGE)
+  ::madvise(ptr, aligned_size, MADV_HUGEPAGE);
+#endif
+  // mmap with MAP_ANONYMOUS already returns zero-filled pages, so an explicit
+  // memset is only needed when the caller relies on it for a non-anonymous
+  // fallback; here it is redundant and skipped to avoid touching every page.
+  (void)zero_fill;
+  return ptr;
+#endif
+}
+
+void MemoryHelper::FreeHugePage(void *ptr, size_t size) {
+  if (ptr == nullptr) {
+    return;
+  }
+#if defined(_WIN64) || defined(_WIN32)
+  (void)size;
+  ::_aligned_free(ptr);
+#else
+  ::munmap(ptr, AlignHugePageSize(size));
+#endif
+}
+
+void *MemoryHelper::AllocateAligned(size_t size, size_t alignment,
+                                    bool zero_fill) {
+  assert(alignment != 0 && (alignment & (alignment - 1)) == 0 &&
+         "alignment must be a power of two");
+  if (size == 0) {
+    return nullptr;
+  }
+  if (size >= HugePageSize()) {
+    return AllocateHugePage(size, zero_fill);
+  }
+
+  // Small block: a regular aligned allocation avoids reserving a whole huge
+  // page. std::aligned_alloc requires the size to be a multiple of alignment.
+  const size_t aligned_size = (size + alignment - 1) / alignment * alignment;
+#if defined(_WIN64) || defined(_WIN32)
+  void *ptr = ::_aligned_malloc(aligned_size, alignment);
+#else
+  void *ptr = std::aligned_alloc(alignment, aligned_size);
+#endif
+  if (ptr == nullptr) {
+    return nullptr;
+  }
+  if (zero_fill) {
+    std::memset(ptr, 0, aligned_size);
+  }
+  return ptr;
+}
+
+void MemoryHelper::FreeAligned(void *ptr, size_t size) {
+  if (ptr == nullptr) {
+    return;
+  }
+  if (size >= HugePageSize()) {
+    FreeHugePage(ptr, size);
+    return;
+  }
+#if defined(_WIN64) || defined(_WIN32)
+  ::_aligned_free(ptr);
+#else
+  std::free(ptr);
+#endif
+}
+
 }  // namespace ailego
-}  // namespace zvec
\ No newline at end of file
+}  // namespace zvec
diff --git a/src/ailego/utility/memory_helper.h b/src/ailego/utility/memory_helper.h
index b6104a842..ee686292b 100644
--- a/src/ailego/utility/memory_helper.h
+++ b/src/ailego/utility/memory_helper.h
@@ -28,6 +28,65 @@ struct MemoryHelper {
   //! Retrieve the huge page size of memory
   static size_t HugePageSize(void);
 
+  //! Round `size` up to a multiple of the huge page size.
+  static size_t AlignHugePageSize(size_t size);
+
+  //! Allocate a large, page-aligned block that prefers transparent huge pages.
+  //!
+  //! On Linux the block is obtained via anonymous mmap and hinted with
+  //! MADV_HUGEPAGE; on other platforms it falls back to a page-aligned
+  //! allocation without the huge-page hint (which is a performance hint, not a
+  //! correctness requirement).  Returns nullptr on failure.
+  //!
+  //! `size` is rounded up to the huge page size internally, and the same
+  //! rounded value is what the corresponding FreeHugePage call expects, so
+  //! callers should treat the returned block as exactly AlignHugePageSize(size)
+  //! bytes.
+  //!
+  //! `zero_fill` requests zeroed memory: when true the returned block is
+  //! guaranteed to be zero-initialized. When false the caller does not require
+  //! zeroing, but the implementation is still free to return zeroed memory and
+  //! does so on the anonymous-mmap path (MAP_ANONYMOUS pages are always zero),
+  //! where an explicit memset is skipped to preserve lazy paging. In other
+  //! words, true => always zeroed; false => zeroing is not guaranteed either
+  //! way. Never assume non-zero contents.
+  //!
+  //! Blocks returned here MUST be released with FreeHugePage (never free()),
+  //! because the underlying allocator differs per platform.
+  static void *AllocateHugePage(size_t size, bool zero_fill = true);
+
+  //! Release a block previously returned by AllocateHugePage.
+  //!
+  //! `size` must be the same value originally passed to AllocateHugePage; it is
+  //! required because the Linux mmap path needs the length for munmap.
+  static void FreeHugePage(void *ptr, size_t size);
+
+  //! Allocate an aligned block, choosing the backing allocator by size.
+  //!
+  //! When `size` is at least the huge page size, the block is obtained via
+  //! AllocateHugePage (huge-page-backed, page-aligned). Otherwise a regular
+  //! `alignment`-aligned allocation is used, which avoids wasting a full huge
+  //! page on small buffers. Returns nullptr on failure.
+  //!
+  //! `alignment` must be a power of two.
+  //!
+  //! `zero_fill` follows the same contract as AllocateHugePage: true guarantees
+  //! zeroed memory; false does not require zeroing but the implementation may
+  //! still return zeroed memory (it does on the huge-page mmap path). Never
+  //! assume non-zero contents.
+  //!
+  //! Blocks returned here MUST be released with FreeAligned, passing the same
+  //! `size`, because the chosen allocator (and therefore the matching free) is
+  //! derived from `size`.
+  static void *AllocateAligned(size_t size, size_t alignment = 64,
+                               bool zero_fill = true);
+
+  //! Release a block previously returned by AllocateAligned.
+  //!
+  //! `size` must be the same value originally passed to AllocateAligned so the
+  //! same allocator path is selected for releasing the block.
+  static void FreeAligned(void *ptr, size_t size);
+
   //! Retrieve the VSZ and RSS of self process in bytes
   static bool SelfUsage(size_t *vsz, size_t *rss);
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 2b676787d..13a974b9a 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -21,6 +21,27 @@ if(RABITQ_SUPPORTED AND AUTO_DETECT_ARCH)
   endforeach()
 endif()
 
+# utility/block_heap.cc uses AVX2 intrinsics guarded by __AVX2__. When the
+# host toolchain supports it, compile this source with an AVX2-capable
+# -march so AVX2 codegen is emitted. zvec_core glob-collects this source
+# too, so per-file flags must be set here as well (in addition to the
+# core_utility target in utility/CMakeLists.txt). Callers runtime-gate
+# invocation of BlockHeap paths on CpuFeatures::AVX2.
+if(NOT ANDROID AND AUTO_DETECT_ARCH)
+  if(HOST_ARCH MATCHES "^(x86|x64)$")
+    setup_compiler_march_for_x86(
+        _BLOCK_HEAP_MARCH_SSE _BLOCK_HEAP_MARCH_AVX2
+        _BLOCK_HEAP_MARCH_AVX512 _BLOCK_HEAP_MARCH_AVX512FP16)
+    if(_BLOCK_HEAP_MARCH_AVX2)
+      set_source_files_properties(
+          utility/block_heap.cc
+          PROPERTIES
+          COMPILE_FLAGS "${_BLOCK_HEAP_MARCH_AVX2}"
+      )
+    endif()
+  endif()
+endif()
+
 cc_directory(framework)
 cc_directory(algorithm)
 cc_directory(metric)
diff --git a/src/core/algorithm/hnsw/CMakeLists.txt b/src/core/algorithm/hnsw/CMakeLists.txt
index cfd1147f4..d59dc8ffc 100644
--- a/src/core/algorithm/hnsw/CMakeLists.txt
+++ b/src/core/algorithm/hnsw/CMakeLists.txt
@@ -10,7 +10,7 @@ cc_library(
     NAME core_knn_hnsw 
     STATIC SHARED STRICT ALWAYS_LINK
     SRCS *.cc
-    LIBS core_framework sparsehash
+    LIBS core_framework core_utility sparsehash
     INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm
     LDFLAGS "${CORE_KNN_HNSW_LDFLAGS}"
     VERSION "${PROXIMA_ZVEC_VERSION}"
diff --git a/src/core/algorithm/hnsw/hnsw_algorithm.cc b/src/core/algorithm/hnsw/hnsw_algorithm.cc
index 75fe5a933..0480cc4f0 100644
--- a/src/core/algorithm/hnsw/hnsw_algorithm.cc
+++ b/src/core/algorithm/hnsw/hnsw_algorithm.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "hnsw_algorithm.h"
+#include <type_traits>
 
 namespace zvec {
 namespace core {
@@ -48,7 +49,7 @@ int HnswAlgorithm<EntityType>::add_node(node_id_t id, level_t level,
 
   for (; cur_level >= 0; --cur_level) {
     search_neighbors(cur_level, &entry_point, &dist, ctx->level_topk(cur_level),
-                     ctx);
+                     ctx, /*use_pool=*/false);
   }
 
   // add neighbors from down level to top level, to avoid upper level visible
@@ -86,7 +87,7 @@ int HnswAlgorithm<EntityType>::search(HnswContext *ctx) const {
 
   auto &topk_heap = ctx->topk_heap();
   topk_heap.clear();
-  search_neighbors(0, &entry_point, &dist, topk_heap, ctx);
+  search_neighbors(0, &entry_point, &dist, topk_heap, ctx, /*use_pool=*/true);
 
   if (ctx->group_by_search()) {
     expand_neighbors_by_group(topk_heap, ctx);
@@ -170,21 +171,117 @@ void HnswAlgorithm<EntityType>::add_neighbors(node_id_t id, level_t level,
   return;
 }
 
-template <typename EntityType>
-void HnswAlgorithm<EntityType>::search_neighbors(level_t level,
-                                                 node_id_t *entry_point,
-                                                 dist_t *dist, TopkHeap &topk,
-                                                 HnswContext *ctx) const {
-  const auto &entity = static_cast<const EntityType &>(ctx->get_entity());
-  HnswDistCalculator &dc = ctx->dist_calculator();
+// ============================================================================
+// search_neighbors helper templates
+//
+// Two specialized inner loops, dispatched from search_neighbors():
+//
+//   fast_search_neighbors:       mmap/contiguous with direct vector pointers.
+//                                Uses BlockHeap (AVX2) or LinearPool (scalar)
+//                                for visited tracking and top-k maintenance.
+//   dual_heap_search_neighbors:  CandidateHeap + TopkHeap + VisitFilter.
+//                                Used for add_node (use_pool=false), filtered
+//                                search, upper levels, and BufferPool fallback.
+// ============================================================================
+
+// mmap/contiguous variant: resolve vectors via get_vector_ptr and use
+// LinearPool or BlockHeap for visited tracking + top-k maintenance.
+// HeapType must expose reset/set_visited/check_visited/push_block/has_next/pop.
+template <typename EntityType, typename HeapType>
+void fast_search_neighbors(const EntityType &entity, HeapType &pool,
+                           VisitFilter &visit, HnswDistCalculator &dc,
+                           uint32_t topk, uint32_t ef, node_id_t entry_point,
+                           dist_t entry_dist, uint32_t prefetch_lines) {
+  const uint32_t max_deg = entity.max_degree(0);  // level 0 only
+  const uint32_t cap = std::max(topk, ef);
+  pool.reset(static_cast<int32_t>(cap), static_cast<int32_t>(max_deg));
+  visit.clear();
+
+  visit.set_visited(entry_point);
+  pool.push_block(&entry_dist, &entry_point, 1);
+
+  static constexpr uint32_t GRAPH_PO = 8;
+
+  uint32_t buf_capacity = max_deg;
+  std::vector<node_id_t> neighbor_ids(buf_capacity);
+  std::vector<float> dists(buf_capacity);
+  std::vector<const void *> neighbor_vecs(buf_capacity);
+
+  while (pool.has_next()) {
+    auto current_node = pool.pop();
+
+    const auto neighbors = entity.get_neighbors_typed(0, current_node);
+    ailego_prefetch(neighbors.data);
+
+    if (neighbors.size() > buf_capacity) {
+      buf_capacity = neighbors.size();
+      neighbor_ids.resize(buf_capacity);
+      dists.resize(buf_capacity);
+      neighbor_vecs.resize(buf_capacity);
+    }
+
+    const uint32_t po =
+        std::min(static_cast<uint32_t>(neighbors.size()), GRAPH_PO);
+    uint32_t unvisited_count = 0;
+    uint32_t i = 0;
+
+    // Phase 1: scan first `po` neighbors with prefetch.
+    for (; i < po; ++i) {
+      node_id_t node = neighbors[i];
+      if (visit.visited(node)) continue;
+      visit.set_visited(node);
+      const void *vec_ptr = entity.get_vector_ptr(node);
+      const char *p = reinterpret_cast<const char *>(vec_ptr);
+      for (uint32_t cl = 0; cl < prefetch_lines; ++cl) {
+        ailego_prefetch(p + cl * 64);
+      }
+      neighbor_ids[unvisited_count] = node;
+      neighbor_vecs[unvisited_count] = vec_ptr;
+      unvisited_count++;
+    }
+
+    // Phase 2: scan remaining neighbors.
+    for (; i < neighbors.size(); ++i) {
+      node_id_t node = neighbors[i];
+      if (visit.visited(node)) continue;
+      visit.set_visited(node);
+      neighbor_ids[unvisited_count] = node;
+      neighbor_vecs[unvisited_count] = entity.get_vector_ptr(node);
+      unvisited_count++;
+    }
+
+    if (unvisited_count == 0) continue;
+    dc.batch_dist(neighbor_vecs.data(), unvisited_count, dists.data());
+
+    pool.push_block(dists.data(), neighbor_ids.data(),
+                    static_cast<int32_t>(unvisited_count));
+  }
+}
+
+// ============================================================================
+// dual_heap_search_neighbors: shared core for the fallback dual-heap path.
+//
+// Maintains a candidate min-heap + topk heap + VisitFilter.  Supports
+// arbitrary levels, filters, and MemoryBlock types (BufferPool/Mmap).
+// Also updates entry_point/dist for next-level continuation.
+// ============================================================================
+template <typename EntityType, typename MemBlockType, typename FilterFn>
+void dual_heap_search_neighbors(const EntityType &entity, level_t level,
+                                node_id_t *entry_point, dist_t *dist,
+                                TopkHeap &topk, HnswContext *ctx,
+                                HnswDistCalculator &dc, FilterFn &&filter) {
+  static constexpr uint32_t BATCH_SIZE = 12;
+  static constexpr uint32_t PREFETCH_STEP = 2;
+
+  uint32_t buf_capacity = entity.max_degree(level);
+  std::vector<node_id_t> neighbor_ids(buf_capacity);
+  std::vector<MemBlockType> neighbor_vec_blocks;
+  neighbor_vec_blocks.reserve(buf_capacity);
+  std::vector<float> dists(buf_capacity);
+  std::vector<const void *> neighbor_vecs(buf_capacity);
+
   VisitFilter &visit = ctx->visit_filter();
   CandidateHeap &candidates = ctx->candidates();
-  std::function<bool(node_id_t)> filter = [](node_id_t) { return false; };
-  if (ctx->filter().is_valid()) {
-    filter = [&](node_id_t id) {
-      return ctx->filter()(entity.get_key_typed(id));
-    };
-  }
 
   candidates.clear();
   visit.clear();
@@ -210,7 +307,14 @@ void HnswAlgorithm<EntityType>::search_neighbors(level_t level,
       (*ctx->mutable_stats_get_neighbors())++;
     }
 
-    std::vector<node_id_t> neighbor_ids(neighbors.size());
+    if (neighbors.size() > buf_capacity) {
+      buf_capacity = neighbors.size();
+      neighbor_ids.resize(buf_capacity);
+      neighbor_vec_blocks.resize(buf_capacity);
+      dists.resize(buf_capacity);
+      neighbor_vecs.resize(buf_capacity);
+    }
+
     uint32_t size = 0;
     for (uint32_t i = 0; i < neighbors.size(); ++i) {
       node_id_t node = neighbors[i];
@@ -227,7 +331,7 @@ void HnswAlgorithm<EntityType>::search_neighbors(level_t level,
       continue;
     }
 
-    std::vector<MemBlockType> neighbor_vec_blocks;
+    neighbor_vec_blocks.clear();
     int ret =
         entity.get_vector_typed(neighbor_ids.data(), size, neighbor_vec_blocks);
     if (ailego_unlikely(ctx->debugging())) {
@@ -238,15 +342,9 @@ void HnswAlgorithm<EntityType>::search_neighbors(level_t level,
     }
 
     // do prefetch
-    static constexpr node_id_t BATCH_SIZE = 12;
-    static constexpr node_id_t PREFETCH_STEP = 2;
     for (uint32_t i = 0; i < std::min(BATCH_SIZE * PREFETCH_STEP, size); ++i) {
       ailego_prefetch(neighbor_vec_blocks[i].data());
     }
-    // done
-
-    std::vector<float> dists(size);
-    std::vector<const void *> neighbor_vecs(size);
 
     for (uint32_t i = 0; i < size; ++i) {
       neighbor_vecs[i] = neighbor_vec_blocks[i].data();
@@ -268,11 +366,77 @@ void HnswAlgorithm<EntityType>::search_neighbors(level_t level,
         if (!filter(node)) {
           topk.emplace(node, cur_dist);
         }
-      }  // end if
-    }  // end for
-  }  // while
+      }
+    }
+  }
+}
 
-  return;
+// ============================================================================
+// search_neighbors: Dispatch to fast or dual-heap path.
+//
+// - add_node / filtered / upper levels  →  dual_heap_search_neighbors
+// - level-0 unfiltered search:
+//     MmapMemoryBlock  →  fast_search_neighbors (BlockHeap/LinearPool)
+//     BufferPool       →  dual_heap_search_neighbors (fallback)
+// ============================================================================
+template <typename EntityType>
+void HnswAlgorithm<EntityType>::search_neighbors(level_t level,
+                                                 node_id_t *entry_point,
+                                                 dist_t *dist, TopkHeap &topk,
+                                                 HnswContext *ctx,
+                                                 bool use_pool) const {
+  const auto &entity = static_cast<const EntityType &>(ctx->get_entity());
+  HnswDistCalculator &dc = ctx->dist_calculator();
+
+  const uint32_t prefetch_lines = (entity.vector_size() + 63) / 64;
+
+  if (!use_pool || ctx->filter().is_valid() || level != 0) {
+    // Dual-heap path: add_node, filtered search, or upper-level scan.
+    auto run_with_filter = [&](auto &&filter) {
+      dual_heap_search_neighbors<EntityType, MemBlockType>(
+          entity, level, entry_point, dist, topk, ctx, dc,
+          std::forward<decltype(filter)>(filter));
+    };
+
+    if (ctx->filter().is_valid()) {
+      auto filter = [&](node_id_t id) {
+        return ctx->filter()(entity.get_key_typed(id));
+      };
+      run_with_filter(filter);
+    } else {
+      auto filter = [](node_id_t) { return false; };
+      run_with_filter(filter);
+    }
+  } else {
+    // Pool-based path for level-0 unfiltered search.
+    if constexpr (std::is_same_v<MemBlockType, MmapMemoryBlock>) {
+      // Fast path: direct pointer access via get_vector_ptr.
+      // BlockHeap (AVX2) or LinearPool (scalar) for top-k tracking.
+      const uint32_t topk_v = static_cast<uint32_t>(ctx->topk());
+      const uint32_t ef_v = ctx->ef();
+      const bool avx2_ok =
+          zvec::ailego::internal::CpuFeatures::static_flags_.AVX2;
+
+      auto &visit = ctx->visit_filter();
+
+      if (avx2_ok) {
+        auto &bpool = ctx->block_pool();
+        fast_search_neighbors(entity, bpool, visit, dc, topk_v, ef_v,
+                              *entry_point, *dist, prefetch_lines);
+        copy_pool_to_topk(bpool, topk);
+      } else {
+        auto &lpool = ctx->pool();
+        fast_search_neighbors(entity, lpool, visit, dc, topk_v, ef_v,
+                              *entry_point, *dist, prefetch_lines);
+        copy_pool_to_topk(lpool, topk);
+      }
+    } else {
+      // BufferPool entities: fallback to dual-heap path.
+      auto filter = [](node_id_t) { return false; };
+      dual_heap_search_neighbors<EntityType, MemBlockType>(
+          entity, level, entry_point, dist, topk, ctx, dc, filter);
+    }
+  }
 }
 
 template <typename EntityType>
@@ -390,9 +554,9 @@ void HnswAlgorithm<EntityType>::expand_neighbors_by_group(
         }
 
         candidates.emplace(node, cur_dist);
-      }  // end for
-    }  // end while
-  }  // end if
+      }
+    }
+  }
 }
 
 template <typename EntityType>
diff --git a/src/core/algorithm/hnsw/hnsw_algorithm.h b/src/core/algorithm/hnsw/hnsw_algorithm.h
index ea73cd708..7851b1c52 100644
--- a/src/core/algorithm/hnsw/hnsw_algorithm.h
+++ b/src/core/algorithm/hnsw/hnsw_algorithm.h
@@ -111,11 +111,14 @@ class HnswAlgorithm : public HnswAlgorithmBase {
   void add_neighbors(node_id_t id, level_t level, TopkHeap &topk_heap,
                      HnswContext *ctx);
 
-  //! Given a node id and level, search the nearest neighbors in graph
-  //! Note: the nearest neighbors result keeps in topk, and entry_point and
-  //! dist will be updated to current level nearest node id and distance
+  //! Given a node id and level, search the nearest neighbors in graph.
+  //! Dispatches to fast_search_neighbors (pool-based, direct pointer) for
+  //! mmap/contiguous level-0 unfiltered search, or dual_heap_search_neighbors
+  //! (CandidateHeap + TopkHeap) for add_node, filtered search, upper levels,
+  //! and BufferPool fallback.
+  //! Note: entry_point and dist will be updated to current level nearest node.
   void search_neighbors(level_t level, node_id_t *entry_point, dist_t *dist,
-                        TopkHeap &topk, HnswContext *ctx) const;
+                        TopkHeap &topk, HnswContext *ctx, bool use_pool) const;
 
   //! Update the node's neighbors
   void update_neighbors(HnswDistCalculator &dc, node_id_t id, level_t level,
diff --git a/src/core/algorithm/hnsw/hnsw_context.h b/src/core/algorithm/hnsw/hnsw_context.h
index ca2c09438..4a0e86854 100644
--- a/src/core/algorithm/hnsw/hnsw_context.h
+++ b/src/core/algorithm/hnsw/hnsw_context.h
@@ -14,6 +14,8 @@
 #pragma once
 
 #include <zvec/core/framework/index_context.h>
+#include "utility/block_heap.h"
+#include "utility/linear_pool.h"
 #include "utility/sparse_utility.h"
 #include "utility/visit_filter.h"
 #include "hnsw_dist_calculator.h"
@@ -275,6 +277,15 @@ class HnswContext : public IndexContext {
     return update_heap_;
   }
 
+  inline LinearPool<dist_t> &pool() {
+    return pool_;
+  }
+
+  // Only accessed under a runtime CpuFeatures::AVX2 guard at call sites.
+  inline BlockHeap &block_pool() {
+    return block_pool_;
+  }
+
   inline VisitFilter &visit_filter() {
     return visit_filter_;
   }
@@ -299,6 +310,10 @@ class HnswContext : public IndexContext {
     ef_ = v;
   }
 
+  inline uint32_t ef(void) const {
+    return ef_;
+  }
+
   inline void set_filter_mode(uint32_t v) {
     filter_mode_ = v;
   }
@@ -530,6 +545,9 @@ class HnswContext : public IndexContext {
   uint32_t stats_get_vector_cnt_{0u};
   uint32_t stats_visit_dup_cnt_{0u};
   std::string preprocess_buffer_;
+
+  LinearPool<dist_t> pool_;
+  BlockHeap block_pool_;
 };
 
 }  // namespace core
diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
index b3c2903dc..50f15c3ff 100644
--- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
+++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
@@ -844,8 +844,11 @@ const HnswEntity::Pointer HnswContiguousStreamerEntity::clone() const {
   }
 
   // Share contiguous memory with the clone (zero-copy)
-  entity->node_memory_ = node_memory_;
-  entity->node_base_ = node_base_;
+  entity->vector_memory_ = vector_memory_;
+  entity->vector_base_ = vector_base_;
+  entity->graph_memory_ = graph_memory_;
+  entity->graph_base_ = graph_base_;
+  entity->graph_stride_ = graph_stride_;
   entity->upper_neighbor_memory_ = upper_neighbor_memory_;
   entity->upper_neighbor_base_ = upper_neighbor_base_;
   entity->upper_chunk_offsets_ = upper_chunk_offsets_;
@@ -900,8 +903,10 @@ char *HnswContiguousStreamerEntity::allocate_contiguous(size_t size) {
 }
 
 int HnswContiguousStreamerEntity::build_contiguous_memory() {
-  node_memory_.reset();
-  node_base_ = nullptr;
+  vector_memory_.reset();
+  vector_base_ = nullptr;
+  graph_memory_.reset();
+  graph_base_ = nullptr;
   upper_neighbor_memory_.reset();
   upper_neighbor_base_ = nullptr;
   upper_chunk_offsets_.clear();
@@ -911,20 +916,36 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() {
     return 0;
   }
 
-  // --- Build contiguous node memory ---
   const size_t per_node = node_size();
-  const size_t total_node_data = static_cast<size_t>(total_docs) * per_node;
-  size_t node_memory_size = AlignHugePageSize(total_node_data);
-  char *raw_node = allocate_contiguous(node_memory_size);
-  if (!raw_node) {
+  const size_t vec_size = vector_size();
+  // graph_stride = key + L0 neighbors (everything except vector)
+  graph_stride_ = sizeof(key_t) + neighbor_size_;
+
+  // --- Allocate flat vector array (stride = vector_size) ---
+  const size_t total_vec_data = static_cast<size_t>(total_docs) * vec_size;
+  size_t vector_memory_size = AlignHugePageSize(total_vec_data);
+  char *raw_vec = allocate_contiguous(vector_memory_size);
+  if (!raw_vec) {
     return IndexError_Runtime;
   }
-  node_memory_.reset(raw_node, ContiguousDeleter{node_memory_size});
-  node_base_ = raw_node;
+  vector_memory_.reset(raw_vec, ContiguousDeleter{vector_memory_size});
+  vector_base_ = raw_vec;
+
+  // --- Allocate graph array (stride = sizeof(key_t) + neighbor_size) ---
+  const size_t total_graph_data =
+      static_cast<size_t>(total_docs) * graph_stride_;
+  size_t graph_memory_size = AlignHugePageSize(total_graph_data);
+  char *raw_graph = allocate_contiguous(graph_memory_size);
+  if (!raw_graph) {
+    vector_memory_.reset();
+    vector_base_ = nullptr;
+    return IndexError_Runtime;
+  }
+  graph_memory_.reset(raw_graph, ContiguousDeleter{graph_memory_size});
+  graph_base_ = raw_graph;
 
-  // Copy node data from chunks into contiguous memory
-  // Each chunk holds node_cnt_per_chunk nodes, laid out at offset
-  // (id & mask) * node_size within the chunk.
+  // Split node data from chunks into vector and graph arrays.
+  // Original node layout: [vector (vec_size) | key (8B) | L0 neighbors]
   const auto &chunks = node_chunks_;
   const uint32_t nodes_per_chunk = 1U << node_index_mask_bits_;
   for (size_t chunk_idx = 0; chunk_idx < chunks.size(); ++chunk_idx) {
@@ -932,19 +953,30 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() {
     size_t data_size = chunks[chunk_idx]->data_size();
     chunks[chunk_idx]->read(0, &chunk_data, data_size);
 
-    // Number of nodes in this chunk
     uint32_t base_id = chunk_idx * nodes_per_chunk;
     uint32_t count_in_chunk = std::min(nodes_per_chunk, total_docs - base_id);
 
-    // Copy each node's data
     const char *src = static_cast<const char *>(chunk_data);
-    char *dst = node_base_ + static_cast<size_t>(base_id) * per_node;
-    std::memcpy(dst, src, static_cast<size_t>(count_in_chunk) * per_node);
+    for (uint32_t i = 0; i < count_in_chunk; ++i) {
+      const char *node_src = src + static_cast<size_t>(i) * per_node;
+      size_t global_id = static_cast<size_t>(base_id + i);
+
+      // Copy vector to flat vector array
+      std::memcpy(vector_base_ + global_id * vec_size, node_src, vec_size);
+
+      // Copy key + L0 neighbors to graph array
+      std::memcpy(graph_base_ + global_id * graph_stride_, node_src + vec_size,
+                  graph_stride_);
+    }
   }
 
   // --- Build contiguous upper neighbor memory ---
   const auto &upper_chunks = upper_neighbor_chunks_;
   if (upper_chunks.empty()) {
+    LOG_INFO(
+        "Built HNSW contiguous memory (split layout): "
+        "vector_mem=%zu graph_mem=%zu total_docs=%u node_chunks=%zu",
+        vector_memory_size, graph_memory_size, total_docs, chunks.size());
     return 0;
   }
 
@@ -962,8 +994,10 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() {
   size_t upper_memory_size = AlignHugePageSize(total_upper_size);
   char *raw_upper = allocate_contiguous(upper_memory_size);
   if (!raw_upper) {
-    node_memory_.reset();
-    node_base_ = nullptr;
+    vector_memory_.reset();
+    vector_base_ = nullptr;
+    graph_memory_.reset();
+    graph_base_ = nullptr;
     return IndexError_Runtime;
   }
   upper_neighbor_memory_.reset(raw_upper, ContiguousDeleter{upper_memory_size});
@@ -979,10 +1013,11 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() {
   }
 
   LOG_INFO(
-      "Built contiguous memory: node_size=%zu upper_neighbor_size=%zu "
+      "Built HNSW contiguous memory (split layout): "
+      "vector_mem=%zu graph_mem=%zu upper_neighbor_mem=%zu "
       "total_docs=%u node_chunks=%zu upper_chunks=%zu",
-      node_memory_size, upper_memory_size, total_docs, chunks.size(),
-      upper_chunks.size());
+      vector_memory_size, graph_memory_size, upper_memory_size, total_docs,
+      chunks.size(), upper_chunks.size());
 
   return 0;
 }
diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h
index 4e7566171..013ad8a6f 100644
--- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h
+++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h
@@ -217,6 +217,10 @@ class HnswStreamerEntity : public HnswEntity {
     return sizeof(NeighborsHeader) + upper_neighbor_cnt() * sizeof(node_id_t);
   }
 
+  inline size_t max_degree(level_t level) const {
+    return level == 0 ? neighbor_size_ : upper_neighbor_size_;
+  }
+
 
  protected:
   union UpperNeighborIndexMeta {
@@ -808,7 +812,8 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity {
   //! static_cast<const HnswMmapStreamerEntity&> in the algorithm is safe.
   const HnswEntity::Pointer clone() const override;
 
-  inline TypedNeighbors get_neighbors_typed(level_t level, node_id_t id) const {
+  ailego_force_inline TypedNeighbors get_neighbors_typed(level_t level,
+                                                         node_id_t id) const {
     if (level == 0UL) {
       uint32_t chunk_idx = id >> node_index_mask_bits_;
       uint32_t offset =
@@ -831,8 +836,9 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity {
     return TypedNeighbors(std::move(block));
   }
 
-  inline int get_vector_typed(const node_id_t *ids, uint32_t count,
-                              std::vector<MmapMemoryBlock> &vec_blocks) const {
+  ailego_force_inline int get_vector_typed(
+      const node_id_t *ids, uint32_t count,
+      std::vector<MmapMemoryBlock> &vec_blocks) const {
     vec_blocks.resize(count);
     for (auto i = 0U; i < count; ++i) {
       uint32_t chunk_idx = ids[i] >> node_index_mask_bits_;
@@ -843,7 +849,7 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity {
     return 0;
   }
 
-  inline key_t get_key_typed(node_id_t id) const {
+  ailego_force_inline key_t get_key_typed(node_id_t id) const {
     if (!use_key_info_map_) {
       return id;
     }
@@ -853,9 +859,18 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity {
     return *reinterpret_cast<const key_t *>(base + offset);
   }
 
- private:
+  //! Direct vector pointer access (no MemoryBlock wrapper).
+  //! For use in the merged search loop to avoid intermediate allocations.
+  ailego_force_inline const void *get_vector_ptr(node_id_t id) const {
+    uint32_t chunk_idx = id >> node_index_mask_bits_;
+    uint32_t offset = (id & node_index_mask_) * node_size();
+    return get_node_chunk_base(chunk_idx) + offset;
+  }
+
+ protected:
   //! Get cached base address for a node chunk, syncing if needed
-  inline const char *get_node_chunk_base(uint32_t chunk_idx) const {
+  ailego_force_inline const char *get_node_chunk_base(
+      uint32_t chunk_idx) const {
     if (ailego_unlikely(chunk_idx >= node_chunk_bases_.size())) {
       sync_node_chunk_bases(chunk_idx);
     }
@@ -863,7 +878,8 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity {
   }
 
   //! Get cached base address for an upper neighbor chunk, syncing if needed
-  inline const char *get_upper_neighbor_chunk_base(uint32_t chunk_idx) const {
+  ailego_force_inline const char *get_upper_neighbor_chunk_base(
+      uint32_t chunk_idx) const {
     if (ailego_unlikely(chunk_idx >= upper_neighbor_chunk_bases_.size())) {
       sync_upper_neighbor_chunk_bases(chunk_idx);
     }
@@ -926,9 +942,11 @@ class HnswBufferPoolStreamerEntity : public HnswStreamerEntity {
 };
 
 //! Typed entity subclass for contiguous memory mode.
-//! Allocates contiguous memory (with hugepage/THP support) and copies all
-//! chunk data into it. Access is via a single base pointer + offset,
-//! eliminating chunk-level indirection and maximizing memory locality.
+//! Splits node data into two dense arrays during build:
+//!   1. vector_base_: flat vector array (stride = vector_size)
+//!   2. graph_base_:  key + L0 neighbors  (stride = graph_stride_)
+//! Total memory = vector_size + graph_stride_ per node (same as original
+//! node_size), but each access pattern gets optimal cache locality.
 class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity {
  public:
   using HnswMmapStreamerEntity::HnswMmapStreamerEntity;
@@ -950,8 +968,10 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity {
   //! Degrade to mmap mode by releasing contiguous memory and falling back
   //! to chunk-based access.
   void degrade_to_mmap() {
-    node_memory_.reset();
-    node_base_ = nullptr;
+    vector_memory_.reset();
+    vector_base_ = nullptr;
+    graph_memory_.reset();
+    graph_base_ = nullptr;
     upper_neighbor_memory_.reset();
     upper_neighbor_base_ = nullptr;
     upper_chunk_offsets_.clear();
@@ -959,7 +979,7 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity {
   }
 
   bool is_contiguous() const {
-    return node_base_ != nullptr;
+    return vector_base_ != nullptr;
   }
 
   int add_vector(level_t level, key_t key, const void *vec,
@@ -974,11 +994,14 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity {
     return HnswMmapStreamerEntity::add_vector_with_id(level, id, vec);
   }
 
-  inline TypedNeighbors get_neighbors_typed(level_t level, node_id_t id) const {
-    if (ailego_likely(node_base_ != nullptr)) {
+  ailego_force_inline TypedNeighbors get_neighbors_typed(level_t level,
+                                                         node_id_t id) const {
+    if (ailego_likely(graph_base_ != nullptr)) {
       if (level == 0UL) {
-        const char *ptr = node_base_ + static_cast<size_t>(id) * node_size() +
-                          vector_size() + sizeof(key_t);
+        // graph layout: [key (sizeof(key_t)) | NeighborsHeader + neighbors]
+        const char *ptr = graph_base_ +
+                          static_cast<size_t>(id) * graph_stride_ +
+                          sizeof(key_t);
         MmapMemoryBlock block(const_cast<char *>(ptr));
         return TypedNeighbors(std::move(block));
       }
@@ -999,13 +1022,14 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity {
     return HnswMmapStreamerEntity::get_neighbors_typed(level, id);
   }
 
-  inline int get_vector_typed(const node_id_t *ids, uint32_t count,
-                              std::vector<MmapMemoryBlock> &vec_blocks) const {
-    if (ailego_likely(node_base_ != nullptr)) {
+  ailego_force_inline int get_vector_typed(
+      const node_id_t *ids, uint32_t count,
+      std::vector<MmapMemoryBlock> &vec_blocks) const {
+    if (ailego_likely(vector_base_ != nullptr)) {
       vec_blocks.resize(count);
       for (auto i = 0U; i < count; ++i) {
         const char *ptr =
-            node_base_ + static_cast<size_t>(ids[i]) * node_size();
+            vector_base_ + static_cast<size_t>(ids[i]) * vector_size();
         vec_blocks[i].reset(const_cast<char *>(ptr));
       }
       return 0;
@@ -1013,18 +1037,29 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity {
     return HnswMmapStreamerEntity::get_vector_typed(ids, count, vec_blocks);
   }
 
-  inline key_t get_key_typed(node_id_t id) const {
-    if (ailego_likely(node_base_ != nullptr)) {
+  ailego_force_inline key_t get_key_typed(node_id_t id) const {
+    if (ailego_likely(graph_base_ != nullptr)) {
       if (!use_key_info_map_) {
         return id;
       }
-      const char *ptr =
-          node_base_ + static_cast<size_t>(id) * node_size() + vector_size();
+      const char *ptr = graph_base_ + static_cast<size_t>(id) * graph_stride_;
       return *reinterpret_cast<const key_t *>(ptr);
     }
     return HnswMmapStreamerEntity::get_key_typed(id);
   }
 
+  //! Direct vector pointer from flat vector array (stride = vector_size).
+  //! For use in the merged search loop to avoid intermediate allocations.
+  ailego_force_inline const void *get_vector_ptr(node_id_t id) const {
+    if (ailego_likely(vector_base_ != nullptr)) {
+      return vector_base_ + static_cast<size_t>(id) * vector_size();
+    }
+    // Fallback to mmap chunk-based access
+    uint32_t chunk_idx = id >> node_index_mask_bits_;
+    uint32_t offset = (id & node_index_mask_) * node_size();
+    return get_node_chunk_base(chunk_idx) + offset;
+  }
+
  protected:
   //! Custom deleter for contiguous memory (munmap / _aligned_free / free)
   //! Used by shared_ptr to properly release mmap'd memory.
@@ -1042,12 +1077,17 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity {
     }
   };
 
-  //! Shared ownership of contiguous memory (enables zero-copy clone)
-  std::shared_ptr<char> node_memory_{};
-  std::shared_ptr<char> upper_neighbor_memory_{};
+  //! Flat vector array: vectors stored densely (stride = vector_size).
+  std::shared_ptr<char> vector_memory_{};
+  char *vector_base_{nullptr};
+
+  //! Graph array: [key | L0 neighbors] stored densely (stride = graph_stride_).
+  std::shared_ptr<char> graph_memory_{};
+  char *graph_base_{nullptr};
+  size_t graph_stride_{0};  // sizeof(key_t) + neighbor_size_
 
-  //! Raw pointers for hot-path access (derived from shared_ptr)
-  char *node_base_{nullptr};
+  //! Shared ownership of upper neighbor contiguous memory
+  std::shared_ptr<char> upper_neighbor_memory_{};
   char *upper_neighbor_base_{nullptr};
 
   //! Cumulative offsets for each upper neighbor chunk in contiguous memory
diff --git a/src/core/algorithm/vamana/vamana_algorithm.cc b/src/core/algorithm/vamana/vamana_algorithm.cc
index a57d36bce..dfd4cc66e 100644
--- a/src/core/algorithm/vamana/vamana_algorithm.cc
+++ b/src/core/algorithm/vamana/vamana_algorithm.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "vamana_algorithm.h"
+#include <type_traits>
+#include <ailego/internal/cpu_features.h>
 
 namespace zvec {
 namespace core {
@@ -48,7 +50,8 @@ int VamanaAlgorithm<EntityType>::add_node(node_id_t id, VamanaContext *ctx) {
   ctx->topk_heap().limit(search_list_size);
   ctx->dist_calculator().clear_compare_cnt();
 
-  // Set query to the new node's vector
+  // Set query to the new node's vector. Use reset_query (same as search path)
+  // so that greedy_search works with the search-optimized distance kernel.
   const void *query_vec = entity_.get_vector(id);
   if (ailego_unlikely(query_vec == nullptr)) {
     LOG_ERROR("Failed to get vector for node %u", id);
@@ -56,10 +59,11 @@ int VamanaAlgorithm<EntityType>::add_node(node_id_t id, VamanaContext *ctx) {
   }
   ctx->reset_query(query_vec);
 
-  greedy_search(entry_point, ctx);
+  greedy_search(entry_point, ctx, /*use_pool=*/false);
 
-  // Step 2: RobustPrune to select diverse neighbors
   auto &topk_heap = ctx->topk_heap();
+
+  // Step 2: RobustPrune to select diverse neighbors
   robust_prune(id, topk_heap, entity_.alpha(), entity_.max_degree(), ctx);
   // Copy result before reverse updates (which also call robust_prune)
   auto pruned_neighbors = ctx->prune_result();
@@ -96,38 +100,119 @@ int VamanaAlgorithm<EntityType>::search(VamanaContext *ctx) const {
   uint32_t ef_search = std::max(static_cast<uint32_t>(ctx->topk()), ctx->ef());
   topk_heap.limit(ef_search);
 
-  greedy_search(entry_point, ctx);
+  greedy_search(entry_point, ctx, /*use_pool=*/true);
 
   return 0;
 }
 
 // ============================================================================
-// greedy_search: Beam search from entry_point.
+// greedy_search helper templates
 //
-// Maintains a candidate min-heap (ordered by distance) and a visited set.
-// At each step, pops the closest unvisited candidate, expands its neighbors,
-// and adds unvisited neighbors to both the candidate heap and the topk heap.
-// Stops when the closest candidate is farther than the worst in topk, or
-// when the scan limit is reached.
+// Two specialized inner loops, dispatched from greedy_search():
+//
+//   fast_greedy_search:       mmap/contiguous with direct vector pointers.
+//                             Uses batch_dist on a pointer array.
+//   slow_greedy_search:       BufferPool-backed storage: must fetch MemBlock
+//                             wrappers via get_vector_typed to pin pages.
+//
+// Both accept either BlockHeap or LinearPool as `HeapType` because the
+// two expose the same reset(n, ef, block_size) / push_block(dists, ids, n)
+// surface (LinearPool adapts via push_block and ignores the block_size hint).
 // ============================================================================
-template <typename EntityType>
-void VamanaAlgorithm<EntityType>::greedy_search(node_id_t entry_point,
-                                                VamanaContext *ctx) const {
-  const auto &entity = static_cast<const EntityType &>(ctx->get_entity());
-  VamanaDistCalculator &dc = ctx->dist_calculator();
-  VisitFilter &visit = ctx->visit_filter();
-  CandidateHeap &candidates = ctx->candidates();
-  auto &topk_heap = ctx->topk_heap();
 
-  const IndexFilter &index_filter =
-      static_cast<const IndexContext *>(ctx)->filter();
-  std::function<bool(node_id_t)> filter = [](node_id_t) { return false; };
-  if (index_filter.is_valid()) {
-    filter = [&](node_id_t id) {
-      return index_filter(entity.get_key_typed(id));
-    };
+// mmap/contiguous variant: resolve vectors via get_vector_ptr
+// and dispatch to the classic pointer-array batch_dist.
+template <typename EntityType, typename HeapType>
+void fast_greedy_search(const EntityType &entity, HeapType &pool,
+                        VisitFilter &visit, VamanaDistCalculator &dc,
+                        uint32_t topk, uint32_t ef, node_id_t entry_point,
+                        uint32_t prefetch_lines) {
+  const uint32_t max_deg = entity.max_degree();
+  const uint32_t cap = std::max(topk, ef);
+  pool.reset(static_cast<int32_t>(cap), static_cast<int32_t>(max_deg));
+  visit.clear();
+
+  dist_t ep_dist = dc.batch_dist(entry_point);
+  visit.set_visited(entry_point);
+  pool.push_block(&ep_dist, &entry_point, 1);
+
+  static constexpr uint32_t GRAPH_PO = 8;
+
+  uint32_t buf_capacity = max_deg;
+  std::vector<node_id_t> neighbor_ids(buf_capacity);
+  std::vector<float> dists(buf_capacity);
+  std::vector<const void *> neighbor_vecs(buf_capacity);
+
+  while (pool.has_next()) {
+    auto current_node = pool.pop();
+
+    const auto neighbors = entity.get_neighbors_typed(current_node);
+    ailego_prefetch(neighbors.data);
+
+    if (neighbors.size() > buf_capacity) {
+      buf_capacity = neighbors.size();
+      neighbor_ids.resize(buf_capacity);
+      dists.resize(buf_capacity);
+      neighbor_vecs.resize(buf_capacity);
+    }
+
+    const uint32_t po =
+        std::min(static_cast<uint32_t>(neighbors.size()), GRAPH_PO);
+    uint32_t unvisited_count = 0;
+    uint32_t i = 0;
+
+    for (; i < po; ++i) {
+      node_id_t node = neighbors[i];
+      if (visit.visited(node)) continue;
+      visit.set_visited(node);
+      const void *vec_ptr = entity.get_vector_ptr(node);
+      const char *p = reinterpret_cast<const char *>(vec_ptr);
+      for (uint32_t cl = 0; cl < prefetch_lines; ++cl) {
+        ailego_prefetch(p + cl * 64);
+      }
+      neighbor_ids[unvisited_count] = node;
+      neighbor_vecs[unvisited_count] = vec_ptr;
+      unvisited_count++;
+    }
+    for (; i < neighbors.size(); ++i) {
+      node_id_t node = neighbors[i];
+      if (visit.visited(node)) continue;
+      visit.set_visited(node);
+      neighbor_ids[unvisited_count] = node;
+      neighbor_vecs[unvisited_count] = entity.get_vector_ptr(node);
+      unvisited_count++;
+    }
+
+    if (unvisited_count == 0) continue;
+    dc.batch_dist(neighbor_vecs.data(), unvisited_count, dists.data());
+    pool.push_block(dists.data(), neighbor_ids.data(),
+                    static_cast<int32_t>(unvisited_count));
   }
+}
+
+// ============================================================================
+// dual_heap_greedy_search: shared core for the fallback dual-heap path.
+//
+// Maintains a candidate min-heap + topk heap + VisitFilter.  Uses plain
+// batch_dist.
+// ============================================================================
+template <typename EntityType, typename MemBlockType, typename FilterFn>
+void dual_heap_greedy_search(const EntityType &entity, VamanaContext *ctx,
+                             VamanaDistCalculator &dc, node_id_t entry_point,
+                             FilterFn &&filter) {
+  static constexpr uint32_t PREFETCH_BATCH = 2;
+  static constexpr uint32_t PREFETCH_STEP = 2;
+
+  uint32_t buf_capacity = entity.max_degree();
+  std::vector<node_id_t> neighbor_ids(buf_capacity);
+  std::vector<MemBlockType> neighbor_vec_blocks;
+  neighbor_vec_blocks.reserve(buf_capacity);
+  std::vector<float> dists(buf_capacity);
+  std::vector<const void *> neighbor_vecs(buf_capacity);
 
+  VisitFilter &visit = ctx->visit_filter();
+  CandidateHeap &candidates = ctx->candidates();
+  auto &topk_heap = ctx->topk_heap();
   candidates.clear();
   visit.clear();
 
@@ -147,16 +232,6 @@ void VamanaAlgorithm<EntityType>::greedy_search(node_id_t entry_point,
   }
   candidates.emplace(entry_point, entry_dist);
 
-  // Pre-allocate temporary vectors outside the hot loop to avoid
-  // per-iteration heap allocations. Sized to max_degree initially;
-  // resized inside the loop if actual neighbor count exceeds this.
-  uint32_t buf_capacity = entity.max_degree();
-  std::vector<node_id_t> neighbor_ids(buf_capacity);
-  std::vector<MemBlockType> neighbor_vec_blocks;
-  neighbor_vec_blocks.reserve(buf_capacity);
-  std::vector<float> dists(buf_capacity);
-  std::vector<const void *> neighbor_vecs(buf_capacity);
-
   while (!candidates.empty() && !ctx->reach_scan_limit()) {
     auto top = candidates.begin();
     node_id_t current_node = top->first;
@@ -198,28 +273,17 @@ void VamanaAlgorithm<EntityType>::greedy_search(node_id_t entry_point,
                                       neighbor_vec_blocks);
     if (ailego_unlikely(ret != 0)) break;
 
-    // Prefetch for better cache performance
-    static constexpr uint32_t PREFETCH_BATCH = 2;
-    static constexpr uint32_t PREFETCH_STEP = 2;
     for (uint32_t i = 0;
          i < std::min(PREFETCH_BATCH * PREFETCH_STEP, unvisited_count); ++i) {
       ailego_prefetch(neighbor_vec_blocks[i].data());
     }
 
-    // Batch distance computation (reuse pre-allocated buffers)
+    // Batch distance computation (reuse pre-allocated buffers).
     for (uint32_t i = 0; i < unvisited_count; ++i) {
       neighbor_vecs[i] = neighbor_vec_blocks[i].data();
     }
     dc.batch_dist(neighbor_vecs.data(), unvisited_count, dists.data());
 
-    // Update candidates and topk.
-    // Unlike vanilla DiskANN which inserts all unvisited neighbors into
-    // the candidate queue unconditionally, we apply an early-pruning
-    // optimization: a neighbor is only inserted into the candidate queue
-    // (and topk_heap) if it could potentially improve the final results,
-    // i.e. either the topk heap is not yet full, or the neighbor is closer
-    // than the current worst result. This avoids expanding clearly
-    // unpromising branches and reduces the candidate queue size.
     for (uint32_t i = 0; i < unvisited_count; ++i) {
       node_id_t node = neighbor_ids[i];
       dist_t node_dist = dists[i];
@@ -233,6 +297,88 @@ void VamanaAlgorithm<EntityType>::greedy_search(node_id_t entry_point,
   }
 }
 
+// ============================================================================
+// greedy_search: Beam search from entry_point.
+//
+// Maintains a candidate min-heap (ordered by distance) and a visited set.
+// At each step, pops the closest unvisited candidate, expands its neighbors,
+// and adds unvisited neighbors to both the candidate heap and the topk heap.
+// Stops when the closest candidate is farther than the worst in topk, or
+// when the scan limit is reached.
+// ============================================================================
+template <typename EntityType>
+void VamanaAlgorithm<EntityType>::greedy_search(node_id_t entry_point,
+                                                VamanaContext *ctx,
+                                                bool use_pool) const {
+  const auto &entity = static_cast<const EntityType &>(ctx->get_entity());
+  VamanaDistCalculator &dc = ctx->dist_calculator();
+
+  const IndexFilter &index_filter =
+      static_cast<const IndexContext *>(ctx)->filter();
+
+  // Number of cache lines per vector (e.g. 2 for dim=128).
+  // Used by both the fallback candidates/filter path and the fast helpers.
+  uint32_t prefetch_lines = (dc.dimension() + 63) / 64;
+  if constexpr (std::is_same_v<EntityType, VamanaContiguousStreamerEntity>) {
+    // Contiguous flat array stride is already 64B-aligned.  Use it so that
+    // prefetch does not overshoot into the next vector.
+    size_t stride = entity.vector_stride();
+    if (stride > 0) {
+      prefetch_lines = static_cast<uint32_t>(stride / 64);
+    }
+  }
+
+  if (!use_pool || index_filter.is_valid()) {
+    // Fallback path used by add_node (use_pool=false) and filtered search.
+    // Dispatched to dual_heap_greedy_search (plain batch_dist).
+    auto run_with_filter = [&](auto &&filter) {
+      dual_heap_greedy_search<EntityType, MemBlockType>(
+          entity, ctx, dc, entry_point, std::forward<decltype(filter)>(filter));
+    };
+
+    if (index_filter.is_valid()) {
+      auto filter = [&](node_id_t id) {
+        return index_filter(entity.get_key_typed(id));
+      };
+      run_with_filter(filter);
+    } else {
+      auto filter = [](node_id_t) { return false; };
+      run_with_filter(filter);
+    }
+  } else {
+    // Fast pool-based path for mmap/contiguous entities that support
+    // direct pointer access. BlockHeap (AVX2) or LinearPool (scalar)
+    // are used for top-k tracking. BufferPool entities fall back to
+    // dual_heap_greedy_search since they lack direct pointer access.
+    if constexpr (std::is_same_v<MemBlockType, MmapMemoryBlock>) {
+      const uint32_t topk_v = static_cast<uint32_t>(ctx->topk());
+      const uint32_t ef_v = ctx->ef();
+      const bool avx2_ok =
+          zvec::ailego::internal::CpuFeatures::static_flags_.AVX2;
+      auto &topk_heap = ctx->topk_heap();
+
+      auto &visit = ctx->visit_filter();
+
+      if (avx2_ok) {
+        auto &bpool = ctx->block_pool();
+        fast_greedy_search(entity, bpool, visit, dc, topk_v, ef_v, entry_point,
+                           prefetch_lines);
+        copy_pool_to_topk(bpool, topk_heap);
+      } else {
+        auto &lpool = ctx->pool();
+        fast_greedy_search(entity, lpool, visit, dc, topk_v, ef_v, entry_point,
+                           prefetch_lines);
+        copy_pool_to_topk(lpool, topk_heap);
+      }
+    } else {
+      // BufferPool entities: fallback to dual-heap path.
+      auto filter = [](node_id_t) { return false; };
+      dual_heap_greedy_search<EntityType, MemBlockType>(entity, ctx, dc,
+                                                        entry_point, filter);
+    }
+  }
+}
+
 // ============================================================================
 // robust_prune: Select up to max_degree diverse neighbors from candidates.
 //
@@ -334,9 +480,11 @@ void VamanaAlgorithm<EntityType>::robust_prune(node_id_t id,
       }
 
       if (batch_count > 0) {
-        // Batch compute distances from selected candidate to remaining
-        dc.batch_dist_pair(selected_vec, batch_vecs.data(), batch_count,
-                           batch_dists.data());
+        // Compute distances from selected candidate to remaining candidates.
+        // distance_ is the symmetric data-to-data kernel (no pairwise split).
+        for (uint32_t k = 0; k < batch_count; ++k) {
+          batch_dists[k] = dc.dist(selected_vec, batch_vecs[k]);
+        }
 
         // DiskANN (L2/Cosine):
         //   occlude_factor[t] = max(occlude_factor[t], dist_to_query /
diff --git a/src/core/algorithm/vamana/vamana_algorithm.h b/src/core/algorithm/vamana/vamana_algorithm.h
index c5b2ce9e7..d8e0c9301 100644
--- a/src/core/algorithm/vamana/vamana_algorithm.h
+++ b/src/core/algorithm/vamana/vamana_algorithm.h
@@ -80,7 +80,8 @@ class VamanaAlgorithm : public VamanaAlgorithmBase {
   // GreedySearch: starting from entry_point, greedily expand the closest
   // unvisited candidate until the search list is exhausted or scan limit
   // is reached. Results accumulate in topk_heap.
-  void greedy_search(node_id_t entry_point, VamanaContext *ctx) const;
+  void greedy_search(node_id_t entry_point, VamanaContext *ctx,
+                     bool use_pool) const;
 
   // RobustPrune: given a candidate set (topk_heap), select up to max_degree
   // diverse neighbors using alpha-based distance comparison.
diff --git a/src/core/algorithm/vamana/vamana_context.h b/src/core/algorithm/vamana/vamana_context.h
index 38cc8457e..b3cacae31 100644
--- a/src/core/algorithm/vamana/vamana_context.h
+++ b/src/core/algorithm/vamana/vamana_context.h
@@ -14,6 +14,8 @@
 #pragma once
 
 #include <zvec/core/framework/index_context.h>
+#include "utility/block_heap.h"
+#include "utility/linear_pool.h"
 #include "utility/visit_filter.h"
 #include "vamana_dist_calculator.h"
 #include "vamana_entity.h"
@@ -119,6 +121,14 @@ class VamanaContext : public IndexContext {
   inline TopkHeap &update_heap() {
     return update_heap_;
   }
+  inline LinearPool<dist_t> &pool() {
+    return pool_;
+  }
+  // Block-insert pool used by the AVX2-gated greedy_search fast path.
+  // Only accessed under a runtime CpuFeatures::AVX2 guard at call sites.
+  inline BlockHeap &block_pool() {
+    return block_pool_;
+  }
   inline VisitFilter &visit_filter() {
     return visit_filter_;
   }
@@ -256,12 +266,6 @@ class VamanaContext : public IndexContext {
     return topk_;
   }
 
-  inline void update_dist_caculator_distance(
-      const IndexMetric::MatrixDistance &distance,
-      const IndexMetric::MatrixBatchDistance &batch_distance) {
-    dc_.update_distance(distance, batch_distance);
-  }
-
  private:
   void fill_random_to_topk_full(void);
 
@@ -320,6 +324,9 @@ class VamanaContext : public IndexContext {
 
   VisitFilter::Mode filter_mode_{VisitFilter::ByteMap};
   float filter_negative_prob_{VamanaEntity::kDefaultBFNegativeProbability};
+
+  LinearPool<dist_t> pool_;
+  BlockHeap block_pool_;
 };
 
 }  // namespace core
diff --git a/src/core/algorithm/vamana/vamana_dist_calculator.h b/src/core/algorithm/vamana/vamana_dist_calculator.h
index c78f63810..ed15b0bdf 100644
--- a/src/core/algorithm/vamana/vamana_dist_calculator.h
+++ b/src/core/algorithm/vamana/vamana_dist_calculator.h
@@ -65,13 +65,6 @@ class VamanaDistCalculator {
     dim_ = dim;
   }
 
-  inline void update_distance(
-      const IndexMetric::MatrixDistance &distance,
-      const IndexMetric::MatrixBatchDistance &batch_distance) {
-    distance_ = distance;
-    batch_distance_ = batch_distance;
-  }
-
   inline void reset_query(const void *query) {
     error_ = false;
     query_ = query;
@@ -136,22 +129,6 @@ class VamanaDistCalculator {
     return score;
   }
 
-  // Batch distance computation between a base vector and multiple target
-  // vectors. Does NOT use query_ and does NOT increment compare_cnt. Used for
-  // inter-candidate distance computation in robust_prune.
-  //
-  // Uses the single distance function (distance_) in a loop rather than
-  // batch_distance_, because batch_distance_ (turbo AVX512-VNNI) expects
-  // the second argument to be a preprocessed uint8 query (+128 shift),
-  // while base_vec here is a raw int8 stored vector. The single distance
-  // function (AVX2 sign/abs trick) correctly handles two raw int8 inputs.
-  inline void batch_dist_pair(const void *base_vec, const void **vecs,
-                              uint32_t count, float *dists) {
-    for (uint32_t i = 0; i < count; ++i) {
-      distance_(base_vec, vecs[i], dim_, &dists[i]);
-    }
-  }
-
   dist_t operator()(const void *vec) {
     return dist(vec);
   }
diff --git a/src/core/algorithm/vamana/vamana_streamer.cc b/src/core/algorithm/vamana/vamana_streamer.cc
index d03163c98..0c08a8238 100644
--- a/src/core/algorithm/vamana/vamana_streamer.cc
+++ b/src/core/algorithm/vamana/vamana_streamer.cc
@@ -235,6 +235,12 @@ int VamanaStreamer::open(IndexStorage::Pointer stg) {
     auto metric_params = index_meta.metric_params();
     metric_params.merge(meta_.metric_params());
     meta_.set_metric(index_meta.metric_name(), 0, metric_params);
+    // Propagate reformer info from stored meta (needed for quantizers
+    // whose reformer params are computed during training, e.g. UniformInt8)
+    if (!index_meta.reformer_name().empty()) {
+      meta_.set_reformer(index_meta.reformer_name(), 0,
+                         index_meta.reformer_params());
+    }
   }
 
   // Create metric
@@ -256,17 +262,6 @@ int VamanaStreamer::open(IndexStorage::Pointer stg) {
     return IndexError_InvalidArgument;
   }
 
-  add_distance_ = metric_->distance();
-  add_batch_distance_ = metric_->batch_distance();
-  search_distance_ = add_distance_;
-  search_batch_distance_ = add_batch_distance_;
-
-  if (metric_->query_metric() && metric_->query_metric()->distance() &&
-      metric_->query_metric()->batch_distance()) {
-    search_distance_ = metric_->query_metric()->distance();
-    search_batch_distance_ = metric_->query_metric()->batch_distance();
-  }
-
   // Create algorithm based on entity storage mode
   switch (entity_->storage_mode()) {
     case VamanaStorageMode::kBufferPool:
@@ -456,8 +451,6 @@ int VamanaStreamer::add_impl(uint64_t pkey, const void *query,
   AILEGO_DEFER([&]() { shared_mutex_.unlock_shared(); });
 
   ctx->clear();
-  ctx->update_dist_caculator_distance(add_distance_, add_batch_distance_);
-  ctx->reset_query(query);
   ctx->check_need_adjuct_ctx(entity_->doc_cnt());
 
   if (metric_->support_train()) {
@@ -529,8 +522,6 @@ int VamanaStreamer::add_with_id_impl(uint32_t id, const void *query,
   AILEGO_DEFER([&]() { shared_mutex_.unlock_shared(); });
 
   ctx->clear();
-  ctx->update_dist_caculator_distance(add_distance_, add_batch_distance_);
-  ctx->reset_query(query);
   ctx->check_need_adjuct_ctx(entity_->doc_cnt());
 
   if (metric_->support_train()) {
@@ -593,7 +584,6 @@ int VamanaStreamer::search_impl(const void *query, const IndexQueryMeta &qmeta,
   }
 
   ctx->clear();
-  ctx->update_dist_caculator_distance(search_distance_, search_batch_distance_);
   ctx->resize_results(count);
   ctx->check_need_adjuct_ctx(entity_->doc_cnt());
 
@@ -655,7 +645,6 @@ int VamanaStreamer::search_bf_impl(const void *query,
   }
 
   ctx->clear();
-  ctx->update_dist_caculator_distance(search_distance_, search_batch_distance_);
   ctx->resize_results(count);
 
   const auto &filter = static_cast<IndexContext *>(ctx)->filter();
@@ -697,7 +686,6 @@ int VamanaStreamer::search_bf_by_p_keys_impl(
   }
 
   ctx->clear();
-  ctx->update_dist_caculator_distance(search_distance_, search_batch_distance_);
   ctx->resize_results(count);
 
   auto &topk = ctx->topk_heap();
diff --git a/src/core/algorithm/vamana/vamana_streamer.h b/src/core/algorithm/vamana/vamana_streamer.h
index 3446c73ed..a1217c9ac 100644
--- a/src/core/algorithm/vamana/vamana_streamer.h
+++ b/src/core/algorithm/vamana/vamana_streamer.h
@@ -150,11 +150,6 @@ class VamanaStreamer : public IndexStreamer {
   IndexMeta meta_{};
   IndexMetric::Pointer metric_{};
 
-  IndexMetric::MatrixDistance add_distance_{};
-  IndexMetric::MatrixDistance search_distance_{};
-  IndexMetric::MatrixBatchDistance add_batch_distance_{};
-  IndexMetric::MatrixBatchDistance search_batch_distance_{};
-
   Stats stats_{};
   std::mutex mutex_{};
 
diff --git a/src/core/algorithm/vamana/vamana_streamer_entity.cc b/src/core/algorithm/vamana/vamana_streamer_entity.cc
index b9bb38145..4324ea2b8 100644
--- a/src/core/algorithm/vamana/vamana_streamer_entity.cc
+++ b/src/core/algorithm/vamana/vamana_streamer_entity.cc
@@ -12,9 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "vamana_streamer_entity.h"
-#if defined(__linux__) || defined(__APPLE__)
-#include <sys/mman.h>
-#endif
 #include <ailego/utility/memory_helper.h>
 #include <zvec/ailego/hash/crc32c.h>
 #include <zvec/core/framework/index_stats.h>
@@ -540,8 +537,12 @@ const VamanaEntity::Pointer VamanaContiguousStreamerEntity::clone() const {
   }
 
   // Share contiguous memory with the clone (zero-copy)
-  entity->node_memory_ = node_memory_;
-  entity->node_base_ = node_base_;
+  entity->vector_memory_ = vector_memory_;
+  entity->vector_base_ = vector_base_;
+  entity->vector_stride_ = vector_stride_;
+  entity->graph_memory_ = graph_memory_;
+  entity->graph_base_ = graph_base_;
+  entity->graph_stride_ = graph_stride_;
 
   return VamanaEntity::Pointer(entity);
 }
@@ -552,56 +553,60 @@ const VamanaEntity::Pointer VamanaContiguousStreamerEntity::clone() const {
 
 char *VamanaContiguousStreamerEntity::allocate_contiguous(size_t size) {
   if (size == 0) return nullptr;
-#if defined(__linux__)
-  void *ptr = ::mmap(nullptr, size, PROT_READ | PROT_WRITE,
-                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-  if (ptr == MAP_FAILED) {
-    LOG_ERROR("mmap failed for contiguous memory, size=%zu", size);
-    return nullptr;
-  }
-  ::madvise(ptr, size, MADV_HUGEPAGE);
-  return static_cast<char *>(ptr);
-#elif defined(__APPLE__)
-  void *ptr = ::mmap(nullptr, size, PROT_READ | PROT_WRITE,
-                     MAP_PRIVATE | MAP_ANON, -1, 0);
-  if (ptr == MAP_FAILED) {
-    LOG_ERROR("mmap failed for contiguous memory, size=%zu", size);
-    return nullptr;
-  }
-  return static_cast<char *>(ptr);
-#elif defined(_WIN32)
-  void *ptr = ::_aligned_malloc(size, ailego::MemoryHelper::PageSize());
-  if (!ptr) {
-    LOG_ERROR("_aligned_malloc failed for contiguous memory, size=%zu", size);
-    return nullptr;
-  }
-  return static_cast<char *>(ptr);
-#else
-  void *ptr = std::aligned_alloc(ailego::MemoryHelper::PageSize(), size);
+  void *ptr = ailego::MemoryHelper::AllocateHugePage(size);
   if (!ptr) {
-    LOG_ERROR("aligned_alloc failed, size=%zu", size);
+    LOG_ERROR("AllocateHugePage failed for contiguous memory, size=%zu", size);
     return nullptr;
   }
   return static_cast<char *>(ptr);
-#endif
 }
 
 int VamanaContiguousStreamerEntity::build_contiguous_memory() {
-  node_memory_.reset();
-  node_base_ = nullptr;
+  vector_memory_.reset();
+  vector_base_ = nullptr;
+  vector_stride_ = 0;
+  graph_memory_.reset();
+  graph_base_ = nullptr;
 
   const uint32_t total_docs = doc_cnt();
   if (total_docs == 0) return 0;
 
   const size_t per_node = node_size();
-  const size_t total_node_data = static_cast<size_t>(total_docs) * per_node;
-  size_t node_memory_size = AlignHugePageSize(total_node_data);
-  char *raw_node = allocate_contiguous(node_memory_size);
-  if (!raw_node) return IndexError_Runtime;
-  node_memory_.reset(raw_node, ContiguousDeleter{node_memory_size});
-  node_base_ = raw_node;
-
-  // Copy node data from chunks into contiguous memory
+  const size_t vec_size = vector_size();
+
+  // Pad per-vector stride up to kVectorAlignment (64B) so every vector
+  // starts on a cache-line boundary.
+  vector_stride_ =
+      (vec_size + (kVectorAlignment - 1)) & ~(kVectorAlignment - 1);
+  // graph_stride = key + neighbors (everything except vector)
+  graph_stride_ = sizeof(key_t) + neighbors_size();
+
+  // Allocate flat vector array (stride = vector_stride_, padded for 64B)
+  const size_t total_vec_data =
+      static_cast<size_t>(total_docs) * vector_stride_;
+  size_t vector_memory_size = AlignHugePageSize(total_vec_data);
+  char *raw_vec = allocate_contiguous(vector_memory_size);
+  if (!raw_vec) return IndexError_Runtime;
+  vector_memory_.reset(raw_vec, ContiguousDeleter{vector_memory_size});
+  vector_base_ = raw_vec;
+
+  // Allocate graph array (stride = sizeof(key_t) + neighbors_size)
+  const size_t total_graph_data =
+      static_cast<size_t>(total_docs) * graph_stride_;
+  size_t graph_memory_size = AlignHugePageSize(total_graph_data);
+  char *raw_graph = allocate_contiguous(graph_memory_size);
+  if (!raw_graph) {
+    vector_memory_.reset();
+    vector_base_ = nullptr;
+    vector_stride_ = 0;
+    return IndexError_Runtime;
+  }
+  graph_memory_.reset(raw_graph, ContiguousDeleter{graph_memory_size});
+  graph_base_ = raw_graph;
+
+  // Split node data from chunks into vector / graph arrays.
+  // Original node layout: [vector (vec_size) | key (8B) | neighbors]
+  // Padding bytes in vector_base_ are left zero (anon mmap is zero-filled).
   const auto &chunks = node_chunks_;
   const uint32_t nodes_per_chunk = 1U << node_index_mask_bits_;
   for (size_t chunk_idx = 0; chunk_idx < chunks.size(); ++chunk_idx) {
@@ -613,14 +618,27 @@ int VamanaContiguousStreamerEntity::build_contiguous_memory() {
     uint32_t count_in_chunk = std::min(nodes_per_chunk, total_docs - base_id);
 
     const char *src = static_cast<const char *>(chunk_data);
-    char *dst = node_base_ + static_cast<size_t>(base_id) * per_node;
-    std::memcpy(dst, src, static_cast<size_t>(count_in_chunk) * per_node);
+    for (uint32_t i = 0; i < count_in_chunk; ++i) {
+      const char *node_src = src + static_cast<size_t>(i) * per_node;
+      size_t global_id = static_cast<size_t>(base_id + i);
+
+      // Copy vector to flat vector array at padded stride
+      std::memcpy(vector_base_ + global_id * vector_stride_, node_src,
+                  vec_size);
+
+      // Copy key + neighbors to graph array
+      std::memcpy(graph_base_ + global_id * graph_stride_, node_src + vec_size,
+                  graph_stride_);
+    }
   }
 
   LOG_INFO(
-      "Built Vamana contiguous memory: node_size=%zu total_docs=%u "
-      "node_chunks=%zu",
-      node_memory_size, total_docs, chunks.size());
+      "Built Vamana contiguous memory: "
+      "vector_mem=%zu graph_mem=%zu total_docs=%u "
+      "node_chunks=%zu vector_size=%zu vector_stride=%zu "
+      "(cache-line aligned to %zuB)",
+      vector_memory_size, graph_memory_size, total_docs, chunks.size(),
+      vec_size, vector_stride_, kVectorAlignment);
 
   return 0;
 }
diff --git a/src/core/algorithm/vamana/vamana_streamer_entity.h b/src/core/algorithm/vamana/vamana_streamer_entity.h
index 59652f2f0..ecc3f70b4 100644
--- a/src/core/algorithm/vamana/vamana_streamer_entity.h
+++ b/src/core/algorithm/vamana/vamana_streamer_entity.h
@@ -16,10 +16,8 @@
 #include <iostream>
 #include <memory>
 #include <mutex>
-#if defined(__linux__) || defined(__APPLE__)
-#include <sys/mman.h>
-#endif
 #include <ailego/parallel/lock.h>
+#include <ailego/utility/memory_helper.h>
 #include <sparsehash/dense_hash_map>
 #include <zvec/ailego/container/heap.h>
 #include <zvec/core/framework/index_framework.h>
@@ -478,7 +476,7 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity {
   //! static_cast<const VamanaMmapStreamerEntity&> in the algorithm is safe.
   const VamanaEntity::Pointer clone() const override;
 
-  inline TypedNeighbors get_neighbors_typed(node_id_t id) const {
+  ailego_force_inline TypedNeighbors get_neighbors_typed(node_id_t id) const {
     uint32_t chunk_idx = id >> node_index_mask_bits_;
     uint32_t offset =
         (id & node_index_mask_) * node_size() + vector_size() + sizeof(key_t);
@@ -487,8 +485,9 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity {
     return TypedNeighbors(std::move(block));
   }
 
-  inline int get_vector_typed(const node_id_t *ids, uint32_t count,
-                              std::vector<MmapMemoryBlock> &vec_blocks) const {
+  ailego_force_inline int get_vector_typed(
+      const node_id_t *ids, uint32_t count,
+      std::vector<MmapMemoryBlock> &vec_blocks) const {
     vec_blocks.resize(count);
     for (auto i = 0U; i < count; ++i) {
       uint32_t chunk_idx = ids[i] >> node_index_mask_bits_;
@@ -499,7 +498,7 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity {
     return 0;
   }
 
-  inline key_t get_key_typed(node_id_t id) const {
+  ailego_force_inline key_t get_key_typed(node_id_t id) const {
     if (!use_key_info_map_) return id;
     uint32_t chunk_idx = id >> node_index_mask_bits_;
     uint32_t offset = (id & node_index_mask_) * node_size() + vector_size();
@@ -507,8 +506,17 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity {
     return *reinterpret_cast<const key_t *>(base + offset);
   }
 
+  //! Direct vector pointer access (no MemoryBlock wrapper).
+  //! For use in the merged search loop to avoid intermediate allocations.
+  ailego_force_inline const void *get_vector_ptr(node_id_t id) const {
+    uint32_t chunk_idx = id >> node_index_mask_bits_;
+    uint32_t offset = (id & node_index_mask_) * node_size();
+    return get_node_chunk_base(chunk_idx) + offset;
+  }
+
  private:
-  inline const char *get_node_chunk_base(uint32_t chunk_idx) const {
+  ailego_force_inline const char *get_node_chunk_base(
+      uint32_t chunk_idx) const {
     if (ailego_unlikely(chunk_idx >= node_chunk_bases_.size())) {
       sync_node_chunk_bases(chunk_idx);
     }
@@ -569,9 +577,11 @@ class VamanaBufferPoolStreamerEntity : public VamanaStreamerEntity {
 };
 
 // --- Typed entity subclass for contiguous memory mode ---
-// Allocates contiguous memory and copies all chunk data into it.
-// Access is via a single base pointer + offset, eliminating chunk-level
-// indirection and maximizing memory locality.
+// Splits node data into two dense arrays during build:
+//   1. vector_base_: flat vector array (stride = vector_size)
+//   2. graph_base_:  key + neighbors  (stride = graph_stride_)
+// Total memory = vector_size + graph_stride_ per node (same as original
+// node_size), but each access pattern gets optimal cache locality.
 class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity {
  public:
   using VamanaMmapStreamerEntity::VamanaMmapStreamerEntity;
@@ -592,13 +602,23 @@ class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity {
   //! Degrade to mmap mode by releasing contiguous memory and falling back
   //! to chunk-based access.
   void degrade_to_mmap() {
-    node_memory_.reset();
-    node_base_ = nullptr;
+    vector_memory_.reset();
+    vector_base_ = nullptr;
+    vector_stride_ = 0;
+    graph_memory_.reset();
+    graph_base_ = nullptr;
     LOG_INFO("Vamana contiguous entity degraded to mmap mode for insertion");
   }
 
   bool is_contiguous() const {
-    return node_base_ != nullptr;
+    return vector_base_ != nullptr;
+  }
+
+  //! Per-entry stride of the flat vector array (0 if no contiguous build).
+  //! Padded up to kVectorAlignment (64B), so it is also the amount that
+  //! should be prefetched per vector.
+  size_t vector_stride() const {
+    return vector_stride_;
   }
 
   int add_vector(key_t key, const void *vec, node_id_t *id) override {
@@ -611,23 +631,25 @@ class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity {
     return VamanaMmapStreamerEntity::add_vector_with_id(id, vec);
   }
 
-  inline TypedNeighbors get_neighbors_typed(node_id_t id) const {
-    if (ailego_likely(node_base_ != nullptr)) {
-      const char *ptr = node_base_ + static_cast<size_t>(id) * node_size() +
-                        vector_size() + sizeof(key_t);
+  ailego_force_inline TypedNeighbors get_neighbors_typed(node_id_t id) const {
+    if (ailego_likely(graph_base_ != nullptr)) {
+      // graph layout: [key (sizeof(key_t)) | NeighborsHeader + neighbors]
+      const char *ptr =
+          graph_base_ + static_cast<size_t>(id) * graph_stride_ + sizeof(key_t);
       MmapMemoryBlock block(const_cast<char *>(ptr));
       return TypedNeighbors(std::move(block));
     }
     return VamanaMmapStreamerEntity::get_neighbors_typed(id);
   }
 
-  inline int get_vector_typed(const node_id_t *ids, uint32_t count,
-                              std::vector<MmapMemoryBlock> &vec_blocks) const {
-    if (ailego_likely(node_base_ != nullptr)) {
+  ailego_force_inline int get_vector_typed(
+      const node_id_t *ids, uint32_t count,
+      std::vector<MmapMemoryBlock> &vec_blocks) const {
+    if (ailego_likely(vector_base_ != nullptr)) {
       vec_blocks.resize(count);
       for (auto i = 0U; i < count; ++i) {
         const char *ptr =
-            node_base_ + static_cast<size_t>(ids[i]) * node_size();
+            vector_base_ + static_cast<size_t>(ids[i]) * vector_stride_;
         vec_blocks[i].reset(const_cast<char *>(ptr));
       }
       return 0;
@@ -635,37 +657,53 @@ class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity {
     return VamanaMmapStreamerEntity::get_vector_typed(ids, count, vec_blocks);
   }
 
-  inline key_t get_key_typed(node_id_t id) const {
-    if (ailego_likely(node_base_ != nullptr)) {
+  ailego_force_inline key_t get_key_typed(node_id_t id) const {
+    if (ailego_likely(graph_base_ != nullptr)) {
       if (!use_key_info_map_) return id;
-      const char *ptr =
-          node_base_ + static_cast<size_t>(id) * node_size() + vector_size();
+      // key is at offset 0 within each graph node
+      const char *ptr = graph_base_ + static_cast<size_t>(id) * graph_stride_;
       return *reinterpret_cast<const key_t *>(ptr);
     }
     return VamanaMmapStreamerEntity::get_key_typed(id);
   }
 
+  //! Direct vector pointer from flat vector array.
+  //! Stride is padded up to kVectorAlignment (64B) to preserve cache-line
+  //! alignment even when vector_size is not a multiple of 64.  The padding is
+  //! purely in-memory and does NOT affect the on-disk index file layout.
+  ailego_force_inline const void *get_vector_ptr(node_id_t id) const {
+    if (ailego_likely(vector_base_ != nullptr)) {
+      return vector_base_ + static_cast<size_t>(id) * vector_stride_;
+    }
+    return VamanaMmapStreamerEntity::get_vector_ptr(id);
+  }
+
  protected:
-  //! Custom deleter for contiguous memory (munmap / _aligned_free / free)
+  //! Custom deleter for contiguous memory allocated via
+  //! MemoryHelper::AllocateHugePage. `size` is the (already huge-page-aligned)
+  //! length passed at allocation time, required by the mmap/munmap path.
   struct ContiguousDeleter {
     size_t size;
     void operator()(char *ptr) const {
-      if (!ptr) return;
-#if defined(__linux__) || defined(__APPLE__)
-      ::munmap(ptr, size);
-#elif defined(_WIN32)
-      ::_aligned_free(ptr);
-#else
-      std::free(ptr);
-#endif
+      ailego::MemoryHelper::FreeHugePage(ptr, size);
     }
   };
 
-  //! Shared ownership of contiguous memory (enables zero-copy clone)
-  std::shared_ptr<char> node_memory_{};
-
-  //! Raw pointer for hot-path access (derived from shared_ptr)
-  char *node_base_{nullptr};
+  //! Flat vector array: vectors stored densely with per-vector stride
+  //! padded up to kVectorAlignment (64B) to keep each vector's starting
+  //! address cache-line aligned. Base is page-aligned by the allocator.
+  std::shared_ptr<char> vector_memory_{};
+  char *vector_base_{nullptr};
+  //! Per-vector stride = AlignUp(vector_size(), kVectorAlignment).
+  size_t vector_stride_{0};
+
+  //! Graph array: [key | neighbors] stored densely (stride = graph_stride_).
+  std::shared_ptr<char> graph_memory_{};
+  char *graph_base_{nullptr};
+  size_t graph_stride_{0};  // sizeof(key_t) + neighbors_size()
+
+  //! Cache-line alignment used for per-vector stride in the flat array.
+  static constexpr size_t kVectorAlignment = 64;
 
  private:
   static char *allocate_contiguous(size_t size);
diff --git a/src/core/utility/CMakeLists.txt b/src/core/utility/CMakeLists.txt
index 7c3adf702..97e6fa360 100644
--- a/src/core/utility/CMakeLists.txt
+++ b/src/core/utility/CMakeLists.txt
@@ -6,6 +6,26 @@ if(NOT APPLE)
       "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a")
 endif()
 
+# block_heap.cc uses AVX2 intrinsics (guarded by __AVX2__) for its
+# push_block fast path. When the host toolchain supports it, compile this
+# source with an AVX2-capable -march so the AVX2 codegen is enabled. On
+# other hosts the scalar fallback inside __AVX2__ guards is compiled
+# instead, and callers runtime-gate invocation on CpuFeatures::AVX2.
+if(NOT ANDROID AND AUTO_DETECT_ARCH)
+  if(HOST_ARCH MATCHES "^(x86|x64)$")
+    setup_compiler_march_for_x86(
+      _BLOCK_HEAP_MARCH_SSE _BLOCK_HEAP_MARCH_AVX2
+      _BLOCK_HEAP_MARCH_AVX512 _BLOCK_HEAP_MARCH_AVX512FP16)
+    if(_BLOCK_HEAP_MARCH_AVX2)
+      set_source_files_properties(
+        ${CMAKE_CURRENT_SOURCE_DIR}/block_heap.cc
+        PROPERTIES
+        COMPILE_FLAGS "${_BLOCK_HEAP_MARCH_AVX2}"
+      )
+    endif()
+  endif()
+endif()
+
 cc_library(
     NAME core_utility 
     STATIC SHARED STRICT ALWAYS_LINK
diff --git a/src/core/utility/block_heap.cc b/src/core/utility/block_heap.cc
new file mode 100644
index 000000000..bef50663f
--- /dev/null
+++ b/src/core/utility/block_heap.cc
@@ -0,0 +1,184 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// This translation unit hosts the AVX2-accelerated push_block implementation
+// for BlockHeap. The build system compiles this .cc with an AVX2-capable
+// `-march` when the toolchain/host supports it (see src/core/CMakeLists.txt
+// and src/core/utility/CMakeLists.txt); otherwise the scalar fallback below
+// is used. Callers must still runtime-gate invocation on CpuFeatures::AVX2
+// because a binary compiled on an AVX2 host may be deployed on a lower-arch
+// machine, in which case the AVX2 code here would fault.
+
+// linear_pool.h (pulled in by block_heap.h) uses printf but does not
+// #include <cstdio>; include it here before block_heap.h so the template
+// non-dependent name lookup at definition time succeeds.
+#include "block_heap.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include "zvec/ailego/internal/platform.h"
+
+#if defined(__AVX2__)
+#include <immintrin.h>
+#endif
+
+namespace zvec {
+namespace core {
+
+void BlockHeap::reset(int32_t capacity, int32_t block_size) {
+  ef_ = capacity;
+  block_size_ = block_size;
+  data_.clear();
+  const size_t reserve_cnt =
+      static_cast<size_t>(std::max(capacity, block_size)) +
+      static_cast<size_t>(block_size);
+  data_.reserve(reserve_cnt);
+  tmp_.clear();
+  tmp_.reserve(static_cast<size_t>(block_size));
+  cur_ = 0;
+}
+
+uint32_t BlockHeap::pop() {
+  size_t ret_idx = cur_;
+  set_checked(data_[cur_].first);
+  while (cur_ < data_.size() && is_checked(data_[cur_].first)) {
+    ++cur_;
+  }
+  return get_id(data_[ret_idx].first);
+}
+
+void BlockHeap::to_sorted(uint32_t *ids, float *scores, int32_t length) const {
+  const int32_t n = std::min(length, static_cast<int32_t>(data_.size()));
+  for (int32_t i = 0; i < n; ++i) {
+    ids[i] = get_id(data_[i].first);
+    if (scores != nullptr) {
+      scores[i] = data_[i].second;
+    }
+  }
+}
+
+void BlockHeap::push_block(const float *distances, const uint32_t *nodes,
+                           int32_t block_size) {
+  // Phase 1: collect candidates with dist < current threshold into tmp_.
+  if (static_cast<int32_t>(data_.size()) == ef_) {
+    const float max_dist = data_.back().second;
+#if defined(__AVX2__)
+    const __m256 threshold_vec = _mm256_set1_ps(max_dist);
+    int32_t i = 0;
+    for (; i + 8 <= block_size; i += 8) {
+      __m256 d = _mm256_loadu_ps(distances + i);
+      __m256 mask = _mm256_cmp_ps(d, threshold_vec, _CMP_LT_OS);
+      int bitmask = _mm256_movemask_ps(mask);
+      if (bitmask == 0) {
+        continue;
+      }
+      while (bitmask) {
+        int tz = ailego_ctz32(bitmask);
+        tmp_.emplace_back(nodes[i + tz], distances[i + tz]);
+        bitmask &= bitmask - 1;
+      }
+    }
+    for (; i < block_size; ++i) {
+      if (distances[i] < max_dist) {
+        tmp_.emplace_back(nodes[i], distances[i]);
+      }
+    }
+#else
+    for (int32_t i = 0; i < block_size; ++i) {
+      if (distances[i] < max_dist) {
+        tmp_.emplace_back(nodes[i], distances[i]);
+      }
+    }
+#endif
+  } else {
+    for (int32_t i = 0; i < block_size; ++i) {
+      tmp_.emplace_back(nodes[i], distances[i]);
+    }
+  }
+  if (tmp_.empty()) {
+    return;
+  }
+
+  // Phase 2: sort tmp_ ascending by distance (with ef truncation).
+  auto cmp = [](const std::pair<uint32_t, float> &a,
+                const std::pair<uint32_t, float> &b) {
+    return a.second < b.second;
+  };
+  if (static_cast<int32_t>(tmp_.size()) > ef_) {
+    // nth_element + sort of the top-ef slice is O(n) + O(k log k), which is
+    // faster than partial_sort's O(n log k) for the hot path.
+    std::nth_element(tmp_.begin(), tmp_.begin() + ef_, tmp_.end(), cmp);
+    tmp_.resize(static_cast<size_t>(ef_));
+  }
+  if (tmp_.size() <= 32) {
+    // Insertion sort for small arrays — branch-predictor friendly and has
+    // lower overhead than std::sort for tiny inputs.
+    for (size_t i = 1; i < tmp_.size(); ++i) {
+      auto key = tmp_[i];
+      int32_t j = static_cast<int32_t>(i) - 1;
+      while (j >= 0 && tmp_[j].second > key.second) {
+        tmp_[j + 1] = tmp_[j];
+        --j;
+      }
+      tmp_[j + 1] = key;
+    }
+  } else {
+    std::sort(tmp_.begin(), tmp_.end(), cmp);
+  }
+
+  // Phase 3: in-place merge (tail-write) data_ and tmp_, truncated at ef_.
+  const int32_t old_data_size = static_cast<int32_t>(data_.size());
+  const int32_t tmp_size = static_cast<int32_t>(tmp_.size());
+  int32_t i = old_data_size - 1;
+  int32_t j = tmp_size - 1;
+  int32_t write_pos = old_data_size + tmp_size - 1;
+  data_.resize(std::min(static_cast<size_t>(old_data_size + tmp_size),
+                        static_cast<size_t>(ef_)));
+  // Drop the overflow tail (entries past ef_): advance i/j without writing,
+  // since data_[write_pos] would be out of bounds.
+  while (write_pos >= ef_) {
+    if (data_[i].second > tmp_[j].second) {
+      --i;
+    } else {
+      --j;
+    }
+    --write_pos;
+  }
+  // Merge phase: consume the larger of data_[i]/tmp_[j] into data_[write_pos].
+  while (i >= 0 && j >= 0) {
+    if (data_[i].second > tmp_[j].second) {
+      data_[write_pos--] = data_[i--];
+    } else {
+      data_[write_pos--] = tmp_[j--];
+    }
+  }
+  if (j >= 0) {
+    // tmp_ entries remaining at front — copy them and reset cursor so the
+    // caller re-scans from the head.
+    while (j >= 0) {
+      data_[write_pos--] = tmp_[j--];
+    }
+    cur_ = 0;
+  } else {
+    // All tmp_ entries consumed; old data_[0..i] are already in place.
+    // Move cursor back if new items were inserted ahead of it.
+    if (static_cast<size_t>(write_pos + 1) <= cur_) {
+      cur_ = static_cast<size_t>(write_pos + 1);
+    }
+  }
+  tmp_.clear();
+}
+
+}  // namespace core
+}  // namespace zvec
diff --git a/src/core/utility/block_heap.h b/src/core/utility/block_heap.h
new file mode 100644
index 000000000..ba3b07ee1
--- /dev/null
+++ b/src/core/utility/block_heap.h
@@ -0,0 +1,116 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace zvec {
+namespace core {
+
+// BlockHeap is a block-insert optimized alternative to LinearPool for graph
+// search. It receives candidates in batches (push_block) and maintains a
+// distance-sorted prefix of size ef with amortized O(k) bookkeeping per
+// batch, replacing LinearPool's one-by-one sorted insert.
+//
+// Derived from pyglass' BlockHeap (https://github.com/zilliztech/pyglass,
+// MIT License; see the NOTICE file and linear_pool.h for the full attribution).
+// The graph prefetch is intentionally omitted: the call-site is expected to
+// issue the neighbor-array prefetch itself (Vamana's greedy_search already
+// does so).
+//
+// AVX2 requirement
+// ----------------
+// The implementation uses AVX2 intrinsics in push_block for the common case
+// where the pool is full and we need to filter a block of candidates against
+// the current distance threshold. The intrinsics are confined to
+// block_heap.cc and guarded with `#if defined(__AVX2__)`, so this header is
+// always safe to include. Callers MUST gate the invocation of BlockHeap-based
+// code paths on a runtime CpuFeatures::AVX2 check to avoid illegal
+// instructions when running on a low-arch machine with a binary built on a
+// higher-arch host.
+struct BlockHeap {
+  BlockHeap() = default;
+  ~BlockHeap() = default;
+
+  BlockHeap(const BlockHeap &) = delete;
+  BlockHeap &operator=(const BlockHeap &) = delete;
+
+  BlockHeap(BlockHeap &&) = default;
+  BlockHeap &operator=(BlockHeap &&) = default;
+
+  // Reset the pool state for a new search round.  `capacity` is the retained
+  // top-k size, `block_size` is an upper bound on the per-call push_block size
+  // (used only for capacity hints).  Visited-node tracking is no longer owned
+  // by the pool — the caller passes a VisitFilter reference instead.
+  void reset(int32_t capacity, int32_t block_size);
+
+  // Insert a block of candidates. The distance array must have at least
+  // `block_size` entries and the id array must have the same length.
+  // `block_size` may differ from the value passed to reset(); reset()'s
+  // block_size is only a capacity hint.
+  void push_block(const float *distances, const uint32_t *nodes,
+                  int32_t block_size);
+
+  // Is there an unpopped candidate?
+  bool has_next() const {
+    return cur_ < data_.size();
+  }
+
+  // Pop the closest unpopped candidate id (without the check bit).
+  // Caller must ensure has_next() is true.
+  uint32_t pop();
+
+  // Retained candidate count.
+  int32_t size() const {
+    return static_cast<int32_t>(data_.size());
+  }
+
+  // Export sorted top-`length` ids (and optionally scores) — data_ is already
+  // distance-sorted ascending.
+  void to_sorted(uint32_t *ids, float *scores, int32_t length) const;
+
+  // Direct sorted accessors (used by search result copy-out).
+  uint32_t id(int32_t i) const {
+    return get_id(data_[i].first);
+  }
+  float dist(int32_t i) const {
+    return data_[i].second;
+  }
+
+  // Internal check-bit helpers (high bit marks a popped entry).
+  static constexpr uint32_t kCheckedBit = 0x80000000u;
+  static constexpr uint32_t kIdMask = 0x7FFFFFFFu;
+
+  static void set_checked(uint32_t &id) {
+    id |= kCheckedBit;
+  }
+  static bool is_checked(uint32_t id) {
+    return (id & kCheckedBit) != 0u;
+  }
+  static uint32_t get_id(uint32_t id) {
+    return id & kIdMask;
+  }
+
+ private:
+  std::vector<std::pair<uint32_t, float>> data_;
+  std::vector<std::pair<uint32_t, float>> tmp_;
+  int32_t ef_{0};
+  int32_t block_size_{0};
+  size_t cur_{0};
+};
+
+}  // namespace core
+}  // namespace zvec
diff --git a/src/core/utility/linear_pool.h b/src/core/utility/linear_pool.h
new file mode 100644
index 000000000..20c3bc8c4
--- /dev/null
+++ b/src/core/utility/linear_pool.h
@@ -0,0 +1,237 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// ===========================================================================
+// Acknowledgement
+// ---------------
+// The LinearPool implementation in this file (and the accompanying Neighbor
+// helper) is adapted from the pyglass project, with modifications
+// (e.g. a BlockHeap-compatible reset()/push_block() interface):
+//
+//     pyglass — Graph Library for Approximate Similarity Search
+//     https://github.com/zilliztech/pyglass
+//
+// pyglass is distributed under the MIT License. The original copyright notice
+// and permission notice are reproduced below as required by that license:
+//
+//     MIT License
+//
+//     Copyright (c) 2023 zh Wang
+//
+//     Permission is hereby granted, free of charge, to any person obtaining a
+//     copy of this software and associated documentation files (the
+//     "Software"), to deal in the Software without restriction, including
+//     without limitation the rights to use, copy, modify, merge, publish,
+//     distribute, sublicense, and/or sell copies of the Software, and to
+//     permit persons to whom the Software is furnished to do so, subject to
+//     the following conditions:
+//
+//     The above copyright notice and this permission notice shall be included
+//     in all copies or substantial portions of the Software.
+//
+//     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+//     OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+//     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+//     IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+//     CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+//     TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+//     SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+// ===========================================================================
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+namespace zvec {
+namespace core {
+
+namespace linear_pool_impl {
+
+template <typename dist_t = float>
+struct Neighbor {
+  int id;
+  dist_t distance;
+
+  Neighbor() = default;
+  Neighbor(int id, dist_t distance) : id(id), distance(distance) {}
+
+  inline friend bool operator<(const Neighbor &lhs, const Neighbor &rhs) {
+    return lhs.distance < rhs.distance ||
+           (lhs.distance == rhs.distance && lhs.id < rhs.id);
+  }
+
+  inline friend bool operator>(const Neighbor &lhs, const Neighbor &rhs) {
+    return !(lhs < rhs);
+  }
+};
+
+}  // namespace linear_pool_impl
+
+template <typename dist_t>
+struct LinearPool {
+  using dist_type = dist_t;
+
+  LinearPool() = default;
+
+  LinearPool(int ef, int capacity)
+      : ef_(ef), capacity_(capacity), data_(capacity_ + 1) {}
+
+  friend void swap(LinearPool &lhs, LinearPool &rhs) {
+    using std::swap;
+    swap(lhs.size_, rhs.size_);
+    swap(lhs.cur_, rhs.cur_);
+    swap(lhs.ef_, rhs.ef_);
+    swap(lhs.capacity_, rhs.capacity_);
+    swap(lhs.data_, rhs.data_);
+  }
+
+  LinearPool(const LinearPool &) = delete;
+
+  LinearPool(LinearPool &&rhs) {
+    swap(*this, rhs);
+  }
+
+  LinearPool &operator=(const LinearPool &) = delete;
+
+  LinearPool &operator=(LinearPool &&rhs) {
+    swap(*this, rhs);
+    return *this;
+  }
+
+  // Reset the pool state for a new search round.  `capacity` is the retained
+  // top-k size, `block_size` is ignored (kept for API parity with BlockHeap).
+  // Visited-node tracking is no longer owned by the pool — the caller passes
+  // a VisitFilter reference to the search loop instead.
+  void reset(int32_t capacity, int32_t /*block_size_ignored*/) {
+    size_ = cur_ = 0;
+    ef_ = capacity;
+    capacity_ = capacity;
+    if (data_.size() < static_cast<size_t>(capacity + 1)) {
+      data_.resize(capacity + 1);
+    }
+  }
+
+  ailego_force_inline int find_bsearch(dist_t dist) {
+    int lo = 0, hi = size_;
+    while (lo < hi) {
+      int mid = (lo + hi) / 2;
+      if (data_[mid].distance > dist) {
+        hi = mid;
+      } else {
+        lo = mid + 1;
+      }
+    }
+    return lo;
+    // int len = size_;
+    // int loc = 0;
+    // while (len > 1) {
+    //   int half = len / 2;
+    //   loc += (dist > data_[loc + half - 1].distance) * half;
+    //   len -= half;
+    // }
+    // return loc;
+  }
+
+  // Block-insert interface matching BlockHeap::push_block: insert each
+  // (node, distance) pair via the one-by-one sorted insert().  Used by the
+  // templated greedy_search helpers so that LinearPool can be plugged in
+  // when AVX2 is unavailable.
+  void push_block(const float *distances, const uint32_t *nodes,
+                  int32_t block_size) {
+    for (int32_t i = 0; i < block_size; ++i) {
+      insert(static_cast<int>(nodes[i]), static_cast<dist_t>(distances[i]));
+    }
+  }
+
+  ailego_force_inline bool insert(int u, dist_t dist) {
+    if (size_ == capacity_ && dist >= data_[size_ - 1].distance) {
+      return false;
+    }
+    int lo = find_bsearch(dist);
+    std::memmove(&data_[lo + 1], &data_[lo],
+                 (size_ - lo) * sizeof(linear_pool_impl::Neighbor<dist_t>));
+    data_[lo] = {u, dist};
+    if (size_ < capacity_) {
+      size_++;
+    }
+    if (lo < cur_) {
+      cur_ = lo;
+    }
+    return true;
+  }
+
+  int pop() {
+    set_checked(data_[cur_].id);
+    int pre = cur_;
+    while (cur_ < size_ && is_checked(data_[cur_].id)) {
+      cur_++;
+    }
+    return get_id(data_[pre].id);
+  }
+
+  bool has_next() const {
+    return cur_ < size_ && cur_ < ef_;
+  }
+  int id(int i) const {
+    return get_id(data_[i].id);
+  }
+  dist_type dist(int i) const {
+    return data_[i].distance;
+  }
+  int size() const {
+    return size_;
+  }
+  int capacity() const {
+    return capacity_;
+  }
+
+  constexpr static int kMask = 2147483647;
+  int get_id(int id) const {
+    return id & kMask;
+  }
+  void set_checked(int &id) {
+    id |= 1 << 31;
+  }
+  bool is_checked(int id) const {
+    return id >> 31 & 1;
+  }
+
+  void to_sorted(int32_t *ids, float *scores, int32_t length) const {
+    for (int32_t i = 0; i < length; ++i) {
+      ids[i] = id(i);
+      if (scores) {
+        scores[i] = dist(i);
+      }
+    }
+  }
+
+  int size_ = 0, cur_ = 0, ef_ = 0, capacity_ = 0;
+  std::vector<linear_pool_impl::Neighbor<dist_t>> data_;
+};
+
+// Copy a single-heap pool's (LinearPool/BlockHeap) distance-sorted retained
+// results into a topk heap. Both the pool and the topk heap are template
+// parameters so this helper is independent of any per-algorithm TopkHeap alias
+// and can be shared by Vamana and HNSW greedy search.
+template <typename PoolType, typename TopkType>
+void copy_pool_to_topk(const PoolType &pool, TopkType &topk) {
+  const int32_t n = static_cast<int32_t>(pool.size());
+  for (int32_t i = 0; i < n; ++i) {
+    topk.emplace(pool.id(i), pool.dist(i));
+  }
+}
+
+}  // namespace core
+}  // namespace zvec
diff --git a/src/include/zvec/ailego/internal/platform.h b/src/include/zvec/ailego/internal/platform.h
index d30cb8865..2b1c1a0e1 100644
--- a/src/include/zvec/ailego/internal/platform.h
+++ b/src/include/zvec/ailego/internal/platform.h
@@ -219,6 +219,13 @@ static inline int ailego_clz64(uint64_t x) {
 #define ailego_prefetch(p) (__builtin_prefetch((p)))
 #endif  // _MSC_VER
 
+//! Force-inline hint: use on hot-path accessors (3-5 instructions).
+#ifdef _MSC_VER
+#define ailego_force_inline __forceinline
+#else
+#define ailego_force_inline inline __attribute__((always_inline))
+#endif
+
 #if defined(AILEGO_M64)
 #define ailego_ctz ailego_ctz64
 #define ailego_clz ailego_clz64
diff --git a/src/turbo/avx512_vnni/record_quantized_int8/common.h b/src/turbo/avx512_vnni/record_quantized_int8/common.h
index 465caf6cf..e72c8f014 100644
--- a/src/turbo/avx512_vnni/record_quantized_int8/common.h
+++ b/src/turbo/avx512_vnni/record_quantized_int8/common.h
@@ -26,12 +26,7 @@
 #include <immintrin.h>
 #include <array>
 #include <cstdint>
-
-#ifdef _MSC_VER
-#define TURBO_ALWAYS_INLINE __forceinline
-#else
-#define TURBO_ALWAYS_INLINE inline __attribute__((always_inline))
-#endif
+#include <zvec/ailego/internal/platform.h>
 
 namespace zvec::turbo::avx512_vnni::internal {
 
@@ -48,7 +43,7 @@ static inline int32_t HorizontalAdd_INT32_V256(__m256i v) {
 // Compute the raw integer inner product of two int8 vectors of length `size`.
 // The result is written to `*distance` as a float.
 // Both `a` and `b` must point to int8_t arrays.
-static TURBO_ALWAYS_INLINE void ip_int8_avx512_vnni(const void *a,
+static ailego_force_inline void ip_int8_avx512_vnni(const void *a,
                                                     const void *b, size_t size,
                                                     float *distance) {
   const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001);
@@ -219,7 +214,7 @@ static TURBO_ALWAYS_INLINE void ip_int8_avx512_vnni(const void *a,
 // Shift the first `original_dim` bytes of `query` in-place from int8 to uint8
 // by adding 128 to each element. The metadata tail beyond `original_dim` is
 // left untouched. This prepares the query for use with dpbusd (uint8 * int8).
-static TURBO_ALWAYS_INLINE void shift_int8_to_uint8_avx512(
+static ailego_force_inline void shift_int8_to_uint8_avx512(
     void *query, size_t original_dim) {
   const int8_t *input = reinterpret_cast<const int8_t *>(query);
   uint8_t *output = reinterpret_cast<uint8_t *>(query);
@@ -244,7 +239,7 @@ static TURBO_ALWAYS_INLINE void shift_int8_to_uint8_avx512(
 // single query. Uses AVX512-VNNI dpbusd instruction.
 // `query` is treated as uint8 (preprocessed), `vectors[i]` as int8.
 template <size_t batch_size>
-TURBO_ALWAYS_INLINE void ip_int8_batch_avx512_vnni_impl(
+ailego_force_inline void ip_int8_batch_avx512_vnni_impl(
     const void *query, const void *const *vectors,
     const std::array<const void *, batch_size> &prefetch_ptrs,
     size_t dimensionality, float *distances) {
@@ -289,7 +284,7 @@ TURBO_ALWAYS_INLINE void ip_int8_batch_avx512_vnni_impl(
 }
 
 // Dispatch batched inner product over all `n` vectors with prefetching.
-static TURBO_ALWAYS_INLINE void ip_int8_batch_avx512_vnni(
+static ailego_force_inline void ip_int8_batch_avx512_vnni(
     const void *const *vectors, const void *query, size_t n, size_t dim,
     float *distances) {
   static constexpr size_t batch_size = 2;