diff --git a/NOTICE b/NOTICE new file mode 100644 index 000000000..1ea6ef8bc --- /dev/null +++ b/NOTICE @@ -0,0 +1,48 @@ +zvec +Copyright 2025-present the zvec project + +This product is licensed under the Apache License, Version 2.0 (see the LICENSE +file). It includes third-party software components that are distributed under +their own licenses, as listed below. + +================================================================================ +Third-Party Components +================================================================================ + +-------------------------------------------------------------------------------- +pyglass +-------------------------------------------------------------------------------- +Project: pyglass — Graph Library for Approximate Similarity Search +Homepage: https://github.com/zilliztech/pyglass +License: MIT License +Used in: src/core/utility/linear_pool.h + +The LinearPool implementation (and the accompanying Neighbor / Bitset helpers) +in src/core/utility/linear_pool.h is adapted from pyglass, with modifications +(a BlockHeap-compatible reset()/push_block() interface and the use of +MemoryHelper for huge-page-backed allocation). The related BlockHeap design in +src/core/utility/block_heap.{h,cc} is also derived from pyglass. + +Original license text: + + MIT License + + Copyright (c) 2023 zh Wang + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. diff --git a/src/ailego/utility/memory_helper.cc b/src/ailego/utility/memory_helper.cc index eee78e407..168c38b2e 100644 --- a/src/ailego/utility/memory_helper.cc +++ b/src/ailego/utility/memory_helper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "memory_helper.h" +#include #include #include #include @@ -29,6 +30,7 @@ #include #include #endif +#include #include #endif @@ -391,5 +393,100 @@ size_t MemoryHelper::HugePageSize(void) { return page_size; } +size_t MemoryHelper::AlignHugePageSize(size_t size) { + const size_t page_mask = HugePageSize() - 1; + return (size + page_mask) & (~page_mask); +} + +void *MemoryHelper::AllocateHugePage(size_t size, bool zero_fill) { + if (size == 0) { + return nullptr; + } + const size_t aligned_size = AlignHugePageSize(size); + +#if defined(_WIN64) || defined(_WIN32) + void *ptr = ::_aligned_malloc(aligned_size, PageSize()); + if (ptr == nullptr) { + return nullptr; + } + if (zero_fill) { + std::memset(ptr, 0, aligned_size); + } + return ptr; +#else + void *ptr = ::mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) { + return nullptr; + } + // MADV_HUGEPAGE is a Linux-only hint for transparent huge pages. On + // macOS/BSD (which manage superpages differently) it is intentionally + // absent; skipping it only forgoes a performance hint, not correctness. +#if defined(MADV_HUGEPAGE) + ::madvise(ptr, aligned_size, MADV_HUGEPAGE); +#endif + // mmap with MAP_ANONYMOUS already returns zero-filled pages, so an explicit + // memset is only needed when the caller relies on it for a non-anonymous + // fallback; here it is redundant and skipped to avoid touching every page. + (void)zero_fill; + return ptr; +#endif +} + +void MemoryHelper::FreeHugePage(void *ptr, size_t size) { + if (ptr == nullptr) { + return; + } +#if defined(_WIN64) || defined(_WIN32) + (void)size; + ::_aligned_free(ptr); +#else + ::munmap(ptr, AlignHugePageSize(size)); +#endif +} + +void *MemoryHelper::AllocateAligned(size_t size, size_t alignment, + bool zero_fill) { + assert(alignment != 0 && (alignment & (alignment - 1)) == 0 && + "alignment must be a power of two"); + if (size == 0) { + return nullptr; + } + if (size >= HugePageSize()) { + return AllocateHugePage(size, zero_fill); + } + + // Small block: a regular aligned allocation avoids reserving a whole huge + // page. std::aligned_alloc requires the size to be a multiple of alignment. + const size_t aligned_size = (size + alignment - 1) / alignment * alignment; +#if defined(_WIN64) || defined(_WIN32) + void *ptr = ::_aligned_malloc(aligned_size, alignment); +#else + void *ptr = std::aligned_alloc(alignment, aligned_size); +#endif + if (ptr == nullptr) { + return nullptr; + } + if (zero_fill) { + std::memset(ptr, 0, aligned_size); + } + return ptr; +} + +void MemoryHelper::FreeAligned(void *ptr, size_t size) { + if (ptr == nullptr) { + return; + } + if (size >= HugePageSize()) { + FreeHugePage(ptr, size); + return; + } +#if defined(_WIN64) || defined(_WIN32) + ::_aligned_free(ptr); +#else + std::free(ptr); +#endif +} + } // namespace ailego -} // namespace zvec \ No newline at end of file +} // namespace zvec diff --git a/src/ailego/utility/memory_helper.h b/src/ailego/utility/memory_helper.h index b6104a842..ee686292b 100644 --- a/src/ailego/utility/memory_helper.h +++ b/src/ailego/utility/memory_helper.h @@ -28,6 +28,65 @@ struct MemoryHelper { //! Retrieve the huge page size of memory static size_t HugePageSize(void); + //! Round `size` up to a multiple of the huge page size. + static size_t AlignHugePageSize(size_t size); + + //! Allocate a large, page-aligned block that prefers transparent huge pages. + //! + //! On Linux the block is obtained via anonymous mmap and hinted with + //! MADV_HUGEPAGE; on other platforms it falls back to a page-aligned + //! allocation without the huge-page hint (which is a performance hint, not a + //! correctness requirement). Returns nullptr on failure. + //! + //! `size` is rounded up to the huge page size internally, and the same + //! rounded value is what the corresponding FreeHugePage call expects, so + //! callers should treat the returned block as exactly AlignHugePageSize(size) + //! bytes. + //! + //! `zero_fill` requests zeroed memory: when true the returned block is + //! guaranteed to be zero-initialized. When false the caller does not require + //! zeroing, but the implementation is still free to return zeroed memory and + //! does so on the anonymous-mmap path (MAP_ANONYMOUS pages are always zero), + //! where an explicit memset is skipped to preserve lazy paging. In other + //! words, true => always zeroed; false => zeroing is not guaranteed either + //! way. Never assume non-zero contents. + //! + //! Blocks returned here MUST be released with FreeHugePage (never free()), + //! because the underlying allocator differs per platform. + static void *AllocateHugePage(size_t size, bool zero_fill = true); + + //! Release a block previously returned by AllocateHugePage. + //! + //! `size` must be the same value originally passed to AllocateHugePage; it is + //! required because the Linux mmap path needs the length for munmap. + static void FreeHugePage(void *ptr, size_t size); + + //! Allocate an aligned block, choosing the backing allocator by size. + //! + //! When `size` is at least the huge page size, the block is obtained via + //! AllocateHugePage (huge-page-backed, page-aligned). Otherwise a regular + //! `alignment`-aligned allocation is used, which avoids wasting a full huge + //! page on small buffers. Returns nullptr on failure. + //! + //! `alignment` must be a power of two. + //! + //! `zero_fill` follows the same contract as AllocateHugePage: true guarantees + //! zeroed memory; false does not require zeroing but the implementation may + //! still return zeroed memory (it does on the huge-page mmap path). Never + //! assume non-zero contents. + //! + //! Blocks returned here MUST be released with FreeAligned, passing the same + //! `size`, because the chosen allocator (and therefore the matching free) is + //! derived from `size`. + static void *AllocateAligned(size_t size, size_t alignment = 64, + bool zero_fill = true); + + //! Release a block previously returned by AllocateAligned. + //! + //! `size` must be the same value originally passed to AllocateAligned so the + //! same allocator path is selected for releasing the block. + static void FreeAligned(void *ptr, size_t size); + //! Retrieve the VSZ and RSS of self process in bytes static bool SelfUsage(size_t *vsz, size_t *rss); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 2b676787d..13a974b9a 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -21,6 +21,27 @@ if(RABITQ_SUPPORTED AND AUTO_DETECT_ARCH) endforeach() endif() +# utility/block_heap.cc uses AVX2 intrinsics guarded by __AVX2__. When the +# host toolchain supports it, compile this source with an AVX2-capable +# -march so AVX2 codegen is emitted. zvec_core glob-collects this source +# too, so per-file flags must be set here as well (in addition to the +# core_utility target in utility/CMakeLists.txt). Callers runtime-gate +# invocation of BlockHeap paths on CpuFeatures::AVX2. +if(NOT ANDROID AND AUTO_DETECT_ARCH) + if(HOST_ARCH MATCHES "^(x86|x64)$") + setup_compiler_march_for_x86( + _BLOCK_HEAP_MARCH_SSE _BLOCK_HEAP_MARCH_AVX2 + _BLOCK_HEAP_MARCH_AVX512 _BLOCK_HEAP_MARCH_AVX512FP16) + if(_BLOCK_HEAP_MARCH_AVX2) + set_source_files_properties( + utility/block_heap.cc + PROPERTIES + COMPILE_FLAGS "${_BLOCK_HEAP_MARCH_AVX2}" + ) + endif() + endif() +endif() + cc_directory(framework) cc_directory(algorithm) cc_directory(metric) diff --git a/src/core/algorithm/hnsw/CMakeLists.txt b/src/core/algorithm/hnsw/CMakeLists.txt index cfd1147f4..d59dc8ffc 100644 --- a/src/core/algorithm/hnsw/CMakeLists.txt +++ b/src/core/algorithm/hnsw/CMakeLists.txt @@ -10,7 +10,7 @@ cc_library( NAME core_knn_hnsw STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc - LIBS core_framework sparsehash + LIBS core_framework core_utility sparsehash INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm LDFLAGS "${CORE_KNN_HNSW_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" diff --git a/src/core/algorithm/hnsw/hnsw_algorithm.cc b/src/core/algorithm/hnsw/hnsw_algorithm.cc index 75fe5a933..0480cc4f0 100644 --- a/src/core/algorithm/hnsw/hnsw_algorithm.cc +++ b/src/core/algorithm/hnsw/hnsw_algorithm.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "hnsw_algorithm.h" +#include namespace zvec { namespace core { @@ -48,7 +49,7 @@ int HnswAlgorithm::add_node(node_id_t id, level_t level, for (; cur_level >= 0; --cur_level) { search_neighbors(cur_level, &entry_point, &dist, ctx->level_topk(cur_level), - ctx); + ctx, /*use_pool=*/false); } // add neighbors from down level to top level, to avoid upper level visible @@ -86,7 +87,7 @@ int HnswAlgorithm::search(HnswContext *ctx) const { auto &topk_heap = ctx->topk_heap(); topk_heap.clear(); - search_neighbors(0, &entry_point, &dist, topk_heap, ctx); + search_neighbors(0, &entry_point, &dist, topk_heap, ctx, /*use_pool=*/true); if (ctx->group_by_search()) { expand_neighbors_by_group(topk_heap, ctx); @@ -170,21 +171,117 @@ void HnswAlgorithm::add_neighbors(node_id_t id, level_t level, return; } -template -void HnswAlgorithm::search_neighbors(level_t level, - node_id_t *entry_point, - dist_t *dist, TopkHeap &topk, - HnswContext *ctx) const { - const auto &entity = static_cast(ctx->get_entity()); - HnswDistCalculator &dc = ctx->dist_calculator(); +// ============================================================================ +// search_neighbors helper templates +// +// Two specialized inner loops, dispatched from search_neighbors(): +// +// fast_search_neighbors: mmap/contiguous with direct vector pointers. +// Uses BlockHeap (AVX2) or LinearPool (scalar) +// for visited tracking and top-k maintenance. +// dual_heap_search_neighbors: CandidateHeap + TopkHeap + VisitFilter. +// Used for add_node (use_pool=false), filtered +// search, upper levels, and BufferPool fallback. +// ============================================================================ + +// mmap/contiguous variant: resolve vectors via get_vector_ptr and use +// LinearPool or BlockHeap for visited tracking + top-k maintenance. +// HeapType must expose reset/set_visited/check_visited/push_block/has_next/pop. +template +void fast_search_neighbors(const EntityType &entity, HeapType &pool, + VisitFilter &visit, HnswDistCalculator &dc, + uint32_t topk, uint32_t ef, node_id_t entry_point, + dist_t entry_dist, uint32_t prefetch_lines) { + const uint32_t max_deg = entity.max_degree(0); // level 0 only + const uint32_t cap = std::max(topk, ef); + pool.reset(static_cast(cap), static_cast(max_deg)); + visit.clear(); + + visit.set_visited(entry_point); + pool.push_block(&entry_dist, &entry_point, 1); + + static constexpr uint32_t GRAPH_PO = 8; + + uint32_t buf_capacity = max_deg; + std::vector neighbor_ids(buf_capacity); + std::vector dists(buf_capacity); + std::vector neighbor_vecs(buf_capacity); + + while (pool.has_next()) { + auto current_node = pool.pop(); + + const auto neighbors = entity.get_neighbors_typed(0, current_node); + ailego_prefetch(neighbors.data); + + if (neighbors.size() > buf_capacity) { + buf_capacity = neighbors.size(); + neighbor_ids.resize(buf_capacity); + dists.resize(buf_capacity); + neighbor_vecs.resize(buf_capacity); + } + + const uint32_t po = + std::min(static_cast(neighbors.size()), GRAPH_PO); + uint32_t unvisited_count = 0; + uint32_t i = 0; + + // Phase 1: scan first `po` neighbors with prefetch. + for (; i < po; ++i) { + node_id_t node = neighbors[i]; + if (visit.visited(node)) continue; + visit.set_visited(node); + const void *vec_ptr = entity.get_vector_ptr(node); + const char *p = reinterpret_cast(vec_ptr); + for (uint32_t cl = 0; cl < prefetch_lines; ++cl) { + ailego_prefetch(p + cl * 64); + } + neighbor_ids[unvisited_count] = node; + neighbor_vecs[unvisited_count] = vec_ptr; + unvisited_count++; + } + + // Phase 2: scan remaining neighbors. + for (; i < neighbors.size(); ++i) { + node_id_t node = neighbors[i]; + if (visit.visited(node)) continue; + visit.set_visited(node); + neighbor_ids[unvisited_count] = node; + neighbor_vecs[unvisited_count] = entity.get_vector_ptr(node); + unvisited_count++; + } + + if (unvisited_count == 0) continue; + dc.batch_dist(neighbor_vecs.data(), unvisited_count, dists.data()); + + pool.push_block(dists.data(), neighbor_ids.data(), + static_cast(unvisited_count)); + } +} + +// ============================================================================ +// dual_heap_search_neighbors: shared core for the fallback dual-heap path. +// +// Maintains a candidate min-heap + topk heap + VisitFilter. Supports +// arbitrary levels, filters, and MemoryBlock types (BufferPool/Mmap). +// Also updates entry_point/dist for next-level continuation. +// ============================================================================ +template +void dual_heap_search_neighbors(const EntityType &entity, level_t level, + node_id_t *entry_point, dist_t *dist, + TopkHeap &topk, HnswContext *ctx, + HnswDistCalculator &dc, FilterFn &&filter) { + static constexpr uint32_t BATCH_SIZE = 12; + static constexpr uint32_t PREFETCH_STEP = 2; + + uint32_t buf_capacity = entity.max_degree(level); + std::vector neighbor_ids(buf_capacity); + std::vector neighbor_vec_blocks; + neighbor_vec_blocks.reserve(buf_capacity); + std::vector dists(buf_capacity); + std::vector neighbor_vecs(buf_capacity); + VisitFilter &visit = ctx->visit_filter(); CandidateHeap &candidates = ctx->candidates(); - std::function filter = [](node_id_t) { return false; }; - if (ctx->filter().is_valid()) { - filter = [&](node_id_t id) { - return ctx->filter()(entity.get_key_typed(id)); - }; - } candidates.clear(); visit.clear(); @@ -210,7 +307,14 @@ void HnswAlgorithm::search_neighbors(level_t level, (*ctx->mutable_stats_get_neighbors())++; } - std::vector neighbor_ids(neighbors.size()); + if (neighbors.size() > buf_capacity) { + buf_capacity = neighbors.size(); + neighbor_ids.resize(buf_capacity); + neighbor_vec_blocks.resize(buf_capacity); + dists.resize(buf_capacity); + neighbor_vecs.resize(buf_capacity); + } + uint32_t size = 0; for (uint32_t i = 0; i < neighbors.size(); ++i) { node_id_t node = neighbors[i]; @@ -227,7 +331,7 @@ void HnswAlgorithm::search_neighbors(level_t level, continue; } - std::vector neighbor_vec_blocks; + neighbor_vec_blocks.clear(); int ret = entity.get_vector_typed(neighbor_ids.data(), size, neighbor_vec_blocks); if (ailego_unlikely(ctx->debugging())) { @@ -238,15 +342,9 @@ void HnswAlgorithm::search_neighbors(level_t level, } // do prefetch - static constexpr node_id_t BATCH_SIZE = 12; - static constexpr node_id_t PREFETCH_STEP = 2; for (uint32_t i = 0; i < std::min(BATCH_SIZE * PREFETCH_STEP, size); ++i) { ailego_prefetch(neighbor_vec_blocks[i].data()); } - // done - - std::vector dists(size); - std::vector neighbor_vecs(size); for (uint32_t i = 0; i < size; ++i) { neighbor_vecs[i] = neighbor_vec_blocks[i].data(); @@ -268,11 +366,77 @@ void HnswAlgorithm::search_neighbors(level_t level, if (!filter(node)) { topk.emplace(node, cur_dist); } - } // end if - } // end for - } // while + } + } + } +} - return; +// ============================================================================ +// search_neighbors: Dispatch to fast or dual-heap path. +// +// - add_node / filtered / upper levels → dual_heap_search_neighbors +// - level-0 unfiltered search: +// MmapMemoryBlock → fast_search_neighbors (BlockHeap/LinearPool) +// BufferPool → dual_heap_search_neighbors (fallback) +// ============================================================================ +template +void HnswAlgorithm::search_neighbors(level_t level, + node_id_t *entry_point, + dist_t *dist, TopkHeap &topk, + HnswContext *ctx, + bool use_pool) const { + const auto &entity = static_cast(ctx->get_entity()); + HnswDistCalculator &dc = ctx->dist_calculator(); + + const uint32_t prefetch_lines = (entity.vector_size() + 63) / 64; + + if (!use_pool || ctx->filter().is_valid() || level != 0) { + // Dual-heap path: add_node, filtered search, or upper-level scan. + auto run_with_filter = [&](auto &&filter) { + dual_heap_search_neighbors( + entity, level, entry_point, dist, topk, ctx, dc, + std::forward(filter)); + }; + + if (ctx->filter().is_valid()) { + auto filter = [&](node_id_t id) { + return ctx->filter()(entity.get_key_typed(id)); + }; + run_with_filter(filter); + } else { + auto filter = [](node_id_t) { return false; }; + run_with_filter(filter); + } + } else { + // Pool-based path for level-0 unfiltered search. + if constexpr (std::is_same_v) { + // Fast path: direct pointer access via get_vector_ptr. + // BlockHeap (AVX2) or LinearPool (scalar) for top-k tracking. + const uint32_t topk_v = static_cast(ctx->topk()); + const uint32_t ef_v = ctx->ef(); + const bool avx2_ok = + zvec::ailego::internal::CpuFeatures::static_flags_.AVX2; + + auto &visit = ctx->visit_filter(); + + if (avx2_ok) { + auto &bpool = ctx->block_pool(); + fast_search_neighbors(entity, bpool, visit, dc, topk_v, ef_v, + *entry_point, *dist, prefetch_lines); + copy_pool_to_topk(bpool, topk); + } else { + auto &lpool = ctx->pool(); + fast_search_neighbors(entity, lpool, visit, dc, topk_v, ef_v, + *entry_point, *dist, prefetch_lines); + copy_pool_to_topk(lpool, topk); + } + } else { + // BufferPool entities: fallback to dual-heap path. + auto filter = [](node_id_t) { return false; }; + dual_heap_search_neighbors( + entity, level, entry_point, dist, topk, ctx, dc, filter); + } + } } template @@ -390,9 +554,9 @@ void HnswAlgorithm::expand_neighbors_by_group( } candidates.emplace(node, cur_dist); - } // end for - } // end while - } // end if + } + } + } } template diff --git a/src/core/algorithm/hnsw/hnsw_algorithm.h b/src/core/algorithm/hnsw/hnsw_algorithm.h index ea73cd708..7851b1c52 100644 --- a/src/core/algorithm/hnsw/hnsw_algorithm.h +++ b/src/core/algorithm/hnsw/hnsw_algorithm.h @@ -111,11 +111,14 @@ class HnswAlgorithm : public HnswAlgorithmBase { void add_neighbors(node_id_t id, level_t level, TopkHeap &topk_heap, HnswContext *ctx); - //! Given a node id and level, search the nearest neighbors in graph - //! Note: the nearest neighbors result keeps in topk, and entry_point and - //! dist will be updated to current level nearest node id and distance + //! Given a node id and level, search the nearest neighbors in graph. + //! Dispatches to fast_search_neighbors (pool-based, direct pointer) for + //! mmap/contiguous level-0 unfiltered search, or dual_heap_search_neighbors + //! (CandidateHeap + TopkHeap) for add_node, filtered search, upper levels, + //! and BufferPool fallback. + //! Note: entry_point and dist will be updated to current level nearest node. void search_neighbors(level_t level, node_id_t *entry_point, dist_t *dist, - TopkHeap &topk, HnswContext *ctx) const; + TopkHeap &topk, HnswContext *ctx, bool use_pool) const; //! Update the node's neighbors void update_neighbors(HnswDistCalculator &dc, node_id_t id, level_t level, diff --git a/src/core/algorithm/hnsw/hnsw_context.h b/src/core/algorithm/hnsw/hnsw_context.h index ca2c09438..4a0e86854 100644 --- a/src/core/algorithm/hnsw/hnsw_context.h +++ b/src/core/algorithm/hnsw/hnsw_context.h @@ -14,6 +14,8 @@ #pragma once #include +#include "utility/block_heap.h" +#include "utility/linear_pool.h" #include "utility/sparse_utility.h" #include "utility/visit_filter.h" #include "hnsw_dist_calculator.h" @@ -275,6 +277,15 @@ class HnswContext : public IndexContext { return update_heap_; } + inline LinearPool &pool() { + return pool_; + } + + // Only accessed under a runtime CpuFeatures::AVX2 guard at call sites. + inline BlockHeap &block_pool() { + return block_pool_; + } + inline VisitFilter &visit_filter() { return visit_filter_; } @@ -299,6 +310,10 @@ class HnswContext : public IndexContext { ef_ = v; } + inline uint32_t ef(void) const { + return ef_; + } + inline void set_filter_mode(uint32_t v) { filter_mode_ = v; } @@ -530,6 +545,9 @@ class HnswContext : public IndexContext { uint32_t stats_get_vector_cnt_{0u}; uint32_t stats_visit_dup_cnt_{0u}; std::string preprocess_buffer_; + + LinearPool pool_; + BlockHeap block_pool_; }; } // namespace core diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc index b3c2903dc..50f15c3ff 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc @@ -844,8 +844,11 @@ const HnswEntity::Pointer HnswContiguousStreamerEntity::clone() const { } // Share contiguous memory with the clone (zero-copy) - entity->node_memory_ = node_memory_; - entity->node_base_ = node_base_; + entity->vector_memory_ = vector_memory_; + entity->vector_base_ = vector_base_; + entity->graph_memory_ = graph_memory_; + entity->graph_base_ = graph_base_; + entity->graph_stride_ = graph_stride_; entity->upper_neighbor_memory_ = upper_neighbor_memory_; entity->upper_neighbor_base_ = upper_neighbor_base_; entity->upper_chunk_offsets_ = upper_chunk_offsets_; @@ -900,8 +903,10 @@ char *HnswContiguousStreamerEntity::allocate_contiguous(size_t size) { } int HnswContiguousStreamerEntity::build_contiguous_memory() { - node_memory_.reset(); - node_base_ = nullptr; + vector_memory_.reset(); + vector_base_ = nullptr; + graph_memory_.reset(); + graph_base_ = nullptr; upper_neighbor_memory_.reset(); upper_neighbor_base_ = nullptr; upper_chunk_offsets_.clear(); @@ -911,20 +916,36 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() { return 0; } - // --- Build contiguous node memory --- const size_t per_node = node_size(); - const size_t total_node_data = static_cast(total_docs) * per_node; - size_t node_memory_size = AlignHugePageSize(total_node_data); - char *raw_node = allocate_contiguous(node_memory_size); - if (!raw_node) { + const size_t vec_size = vector_size(); + // graph_stride = key + L0 neighbors (everything except vector) + graph_stride_ = sizeof(key_t) + neighbor_size_; + + // --- Allocate flat vector array (stride = vector_size) --- + const size_t total_vec_data = static_cast(total_docs) * vec_size; + size_t vector_memory_size = AlignHugePageSize(total_vec_data); + char *raw_vec = allocate_contiguous(vector_memory_size); + if (!raw_vec) { return IndexError_Runtime; } - node_memory_.reset(raw_node, ContiguousDeleter{node_memory_size}); - node_base_ = raw_node; + vector_memory_.reset(raw_vec, ContiguousDeleter{vector_memory_size}); + vector_base_ = raw_vec; + + // --- Allocate graph array (stride = sizeof(key_t) + neighbor_size) --- + const size_t total_graph_data = + static_cast(total_docs) * graph_stride_; + size_t graph_memory_size = AlignHugePageSize(total_graph_data); + char *raw_graph = allocate_contiguous(graph_memory_size); + if (!raw_graph) { + vector_memory_.reset(); + vector_base_ = nullptr; + return IndexError_Runtime; + } + graph_memory_.reset(raw_graph, ContiguousDeleter{graph_memory_size}); + graph_base_ = raw_graph; - // Copy node data from chunks into contiguous memory - // Each chunk holds node_cnt_per_chunk nodes, laid out at offset - // (id & mask) * node_size within the chunk. + // Split node data from chunks into vector and graph arrays. + // Original node layout: [vector (vec_size) | key (8B) | L0 neighbors] const auto &chunks = node_chunks_; const uint32_t nodes_per_chunk = 1U << node_index_mask_bits_; for (size_t chunk_idx = 0; chunk_idx < chunks.size(); ++chunk_idx) { @@ -932,19 +953,30 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() { size_t data_size = chunks[chunk_idx]->data_size(); chunks[chunk_idx]->read(0, &chunk_data, data_size); - // Number of nodes in this chunk uint32_t base_id = chunk_idx * nodes_per_chunk; uint32_t count_in_chunk = std::min(nodes_per_chunk, total_docs - base_id); - // Copy each node's data const char *src = static_cast(chunk_data); - char *dst = node_base_ + static_cast(base_id) * per_node; - std::memcpy(dst, src, static_cast(count_in_chunk) * per_node); + for (uint32_t i = 0; i < count_in_chunk; ++i) { + const char *node_src = src + static_cast(i) * per_node; + size_t global_id = static_cast(base_id + i); + + // Copy vector to flat vector array + std::memcpy(vector_base_ + global_id * vec_size, node_src, vec_size); + + // Copy key + L0 neighbors to graph array + std::memcpy(graph_base_ + global_id * graph_stride_, node_src + vec_size, + graph_stride_); + } } // --- Build contiguous upper neighbor memory --- const auto &upper_chunks = upper_neighbor_chunks_; if (upper_chunks.empty()) { + LOG_INFO( + "Built HNSW contiguous memory (split layout): " + "vector_mem=%zu graph_mem=%zu total_docs=%u node_chunks=%zu", + vector_memory_size, graph_memory_size, total_docs, chunks.size()); return 0; } @@ -962,8 +994,10 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() { size_t upper_memory_size = AlignHugePageSize(total_upper_size); char *raw_upper = allocate_contiguous(upper_memory_size); if (!raw_upper) { - node_memory_.reset(); - node_base_ = nullptr; + vector_memory_.reset(); + vector_base_ = nullptr; + graph_memory_.reset(); + graph_base_ = nullptr; return IndexError_Runtime; } upper_neighbor_memory_.reset(raw_upper, ContiguousDeleter{upper_memory_size}); @@ -979,10 +1013,11 @@ int HnswContiguousStreamerEntity::build_contiguous_memory() { } LOG_INFO( - "Built contiguous memory: node_size=%zu upper_neighbor_size=%zu " + "Built HNSW contiguous memory (split layout): " + "vector_mem=%zu graph_mem=%zu upper_neighbor_mem=%zu " "total_docs=%u node_chunks=%zu upper_chunks=%zu", - node_memory_size, upper_memory_size, total_docs, chunks.size(), - upper_chunks.size()); + vector_memory_size, graph_memory_size, upper_memory_size, total_docs, + chunks.size(), upper_chunks.size()); return 0; } diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 4e7566171..013ad8a6f 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -217,6 +217,10 @@ class HnswStreamerEntity : public HnswEntity { return sizeof(NeighborsHeader) + upper_neighbor_cnt() * sizeof(node_id_t); } + inline size_t max_degree(level_t level) const { + return level == 0 ? neighbor_size_ : upper_neighbor_size_; + } + protected: union UpperNeighborIndexMeta { @@ -808,7 +812,8 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity { //! static_cast in the algorithm is safe. const HnswEntity::Pointer clone() const override; - inline TypedNeighbors get_neighbors_typed(level_t level, node_id_t id) const { + ailego_force_inline TypedNeighbors get_neighbors_typed(level_t level, + node_id_t id) const { if (level == 0UL) { uint32_t chunk_idx = id >> node_index_mask_bits_; uint32_t offset = @@ -831,8 +836,9 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity { return TypedNeighbors(std::move(block)); } - inline int get_vector_typed(const node_id_t *ids, uint32_t count, - std::vector &vec_blocks) const { + ailego_force_inline int get_vector_typed( + const node_id_t *ids, uint32_t count, + std::vector &vec_blocks) const { vec_blocks.resize(count); for (auto i = 0U; i < count; ++i) { uint32_t chunk_idx = ids[i] >> node_index_mask_bits_; @@ -843,7 +849,7 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity { return 0; } - inline key_t get_key_typed(node_id_t id) const { + ailego_force_inline key_t get_key_typed(node_id_t id) const { if (!use_key_info_map_) { return id; } @@ -853,9 +859,18 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity { return *reinterpret_cast(base + offset); } - private: + //! Direct vector pointer access (no MemoryBlock wrapper). + //! For use in the merged search loop to avoid intermediate allocations. + ailego_force_inline const void *get_vector_ptr(node_id_t id) const { + uint32_t chunk_idx = id >> node_index_mask_bits_; + uint32_t offset = (id & node_index_mask_) * node_size(); + return get_node_chunk_base(chunk_idx) + offset; + } + + protected: //! Get cached base address for a node chunk, syncing if needed - inline const char *get_node_chunk_base(uint32_t chunk_idx) const { + ailego_force_inline const char *get_node_chunk_base( + uint32_t chunk_idx) const { if (ailego_unlikely(chunk_idx >= node_chunk_bases_.size())) { sync_node_chunk_bases(chunk_idx); } @@ -863,7 +878,8 @@ class HnswMmapStreamerEntity : public HnswStreamerEntity { } //! Get cached base address for an upper neighbor chunk, syncing if needed - inline const char *get_upper_neighbor_chunk_base(uint32_t chunk_idx) const { + ailego_force_inline const char *get_upper_neighbor_chunk_base( + uint32_t chunk_idx) const { if (ailego_unlikely(chunk_idx >= upper_neighbor_chunk_bases_.size())) { sync_upper_neighbor_chunk_bases(chunk_idx); } @@ -926,9 +942,11 @@ class HnswBufferPoolStreamerEntity : public HnswStreamerEntity { }; //! Typed entity subclass for contiguous memory mode. -//! Allocates contiguous memory (with hugepage/THP support) and copies all -//! chunk data into it. Access is via a single base pointer + offset, -//! eliminating chunk-level indirection and maximizing memory locality. +//! Splits node data into two dense arrays during build: +//! 1. vector_base_: flat vector array (stride = vector_size) +//! 2. graph_base_: key + L0 neighbors (stride = graph_stride_) +//! Total memory = vector_size + graph_stride_ per node (same as original +//! node_size), but each access pattern gets optimal cache locality. class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity { public: using HnswMmapStreamerEntity::HnswMmapStreamerEntity; @@ -950,8 +968,10 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity { //! Degrade to mmap mode by releasing contiguous memory and falling back //! to chunk-based access. void degrade_to_mmap() { - node_memory_.reset(); - node_base_ = nullptr; + vector_memory_.reset(); + vector_base_ = nullptr; + graph_memory_.reset(); + graph_base_ = nullptr; upper_neighbor_memory_.reset(); upper_neighbor_base_ = nullptr; upper_chunk_offsets_.clear(); @@ -959,7 +979,7 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity { } bool is_contiguous() const { - return node_base_ != nullptr; + return vector_base_ != nullptr; } int add_vector(level_t level, key_t key, const void *vec, @@ -974,11 +994,14 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity { return HnswMmapStreamerEntity::add_vector_with_id(level, id, vec); } - inline TypedNeighbors get_neighbors_typed(level_t level, node_id_t id) const { - if (ailego_likely(node_base_ != nullptr)) { + ailego_force_inline TypedNeighbors get_neighbors_typed(level_t level, + node_id_t id) const { + if (ailego_likely(graph_base_ != nullptr)) { if (level == 0UL) { - const char *ptr = node_base_ + static_cast(id) * node_size() + - vector_size() + sizeof(key_t); + // graph layout: [key (sizeof(key_t)) | NeighborsHeader + neighbors] + const char *ptr = graph_base_ + + static_cast(id) * graph_stride_ + + sizeof(key_t); MmapMemoryBlock block(const_cast(ptr)); return TypedNeighbors(std::move(block)); } @@ -999,13 +1022,14 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity { return HnswMmapStreamerEntity::get_neighbors_typed(level, id); } - inline int get_vector_typed(const node_id_t *ids, uint32_t count, - std::vector &vec_blocks) const { - if (ailego_likely(node_base_ != nullptr)) { + ailego_force_inline int get_vector_typed( + const node_id_t *ids, uint32_t count, + std::vector &vec_blocks) const { + if (ailego_likely(vector_base_ != nullptr)) { vec_blocks.resize(count); for (auto i = 0U; i < count; ++i) { const char *ptr = - node_base_ + static_cast(ids[i]) * node_size(); + vector_base_ + static_cast(ids[i]) * vector_size(); vec_blocks[i].reset(const_cast(ptr)); } return 0; @@ -1013,18 +1037,29 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity { return HnswMmapStreamerEntity::get_vector_typed(ids, count, vec_blocks); } - inline key_t get_key_typed(node_id_t id) const { - if (ailego_likely(node_base_ != nullptr)) { + ailego_force_inline key_t get_key_typed(node_id_t id) const { + if (ailego_likely(graph_base_ != nullptr)) { if (!use_key_info_map_) { return id; } - const char *ptr = - node_base_ + static_cast(id) * node_size() + vector_size(); + const char *ptr = graph_base_ + static_cast(id) * graph_stride_; return *reinterpret_cast(ptr); } return HnswMmapStreamerEntity::get_key_typed(id); } + //! Direct vector pointer from flat vector array (stride = vector_size). + //! For use in the merged search loop to avoid intermediate allocations. + ailego_force_inline const void *get_vector_ptr(node_id_t id) const { + if (ailego_likely(vector_base_ != nullptr)) { + return vector_base_ + static_cast(id) * vector_size(); + } + // Fallback to mmap chunk-based access + uint32_t chunk_idx = id >> node_index_mask_bits_; + uint32_t offset = (id & node_index_mask_) * node_size(); + return get_node_chunk_base(chunk_idx) + offset; + } + protected: //! Custom deleter for contiguous memory (munmap / _aligned_free / free) //! Used by shared_ptr to properly release mmap'd memory. @@ -1042,12 +1077,17 @@ class HnswContiguousStreamerEntity : public HnswMmapStreamerEntity { } }; - //! Shared ownership of contiguous memory (enables zero-copy clone) - std::shared_ptr node_memory_{}; - std::shared_ptr upper_neighbor_memory_{}; + //! Flat vector array: vectors stored densely (stride = vector_size). + std::shared_ptr vector_memory_{}; + char *vector_base_{nullptr}; + + //! Graph array: [key | L0 neighbors] stored densely (stride = graph_stride_). + std::shared_ptr graph_memory_{}; + char *graph_base_{nullptr}; + size_t graph_stride_{0}; // sizeof(key_t) + neighbor_size_ - //! Raw pointers for hot-path access (derived from shared_ptr) - char *node_base_{nullptr}; + //! Shared ownership of upper neighbor contiguous memory + std::shared_ptr upper_neighbor_memory_{}; char *upper_neighbor_base_{nullptr}; //! Cumulative offsets for each upper neighbor chunk in contiguous memory diff --git a/src/core/algorithm/vamana/vamana_algorithm.cc b/src/core/algorithm/vamana/vamana_algorithm.cc index a57d36bce..dfd4cc66e 100644 --- a/src/core/algorithm/vamana/vamana_algorithm.cc +++ b/src/core/algorithm/vamana/vamana_algorithm.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "vamana_algorithm.h" +#include +#include namespace zvec { namespace core { @@ -48,7 +50,8 @@ int VamanaAlgorithm::add_node(node_id_t id, VamanaContext *ctx) { ctx->topk_heap().limit(search_list_size); ctx->dist_calculator().clear_compare_cnt(); - // Set query to the new node's vector + // Set query to the new node's vector. Use reset_query (same as search path) + // so that greedy_search works with the search-optimized distance kernel. const void *query_vec = entity_.get_vector(id); if (ailego_unlikely(query_vec == nullptr)) { LOG_ERROR("Failed to get vector for node %u", id); @@ -56,10 +59,11 @@ int VamanaAlgorithm::add_node(node_id_t id, VamanaContext *ctx) { } ctx->reset_query(query_vec); - greedy_search(entry_point, ctx); + greedy_search(entry_point, ctx, /*use_pool=*/false); - // Step 2: RobustPrune to select diverse neighbors auto &topk_heap = ctx->topk_heap(); + + // Step 2: RobustPrune to select diverse neighbors robust_prune(id, topk_heap, entity_.alpha(), entity_.max_degree(), ctx); // Copy result before reverse updates (which also call robust_prune) auto pruned_neighbors = ctx->prune_result(); @@ -96,38 +100,119 @@ int VamanaAlgorithm::search(VamanaContext *ctx) const { uint32_t ef_search = std::max(static_cast(ctx->topk()), ctx->ef()); topk_heap.limit(ef_search); - greedy_search(entry_point, ctx); + greedy_search(entry_point, ctx, /*use_pool=*/true); return 0; } // ============================================================================ -// greedy_search: Beam search from entry_point. +// greedy_search helper templates // -// Maintains a candidate min-heap (ordered by distance) and a visited set. -// At each step, pops the closest unvisited candidate, expands its neighbors, -// and adds unvisited neighbors to both the candidate heap and the topk heap. -// Stops when the closest candidate is farther than the worst in topk, or -// when the scan limit is reached. +// Two specialized inner loops, dispatched from greedy_search(): +// +// fast_greedy_search: mmap/contiguous with direct vector pointers. +// Uses batch_dist on a pointer array. +// slow_greedy_search: BufferPool-backed storage: must fetch MemBlock +// wrappers via get_vector_typed to pin pages. +// +// Both accept either BlockHeap or LinearPool as `HeapType` because the +// two expose the same reset(n, ef, block_size) / push_block(dists, ids, n) +// surface (LinearPool adapts via push_block and ignores the block_size hint). // ============================================================================ -template -void VamanaAlgorithm::greedy_search(node_id_t entry_point, - VamanaContext *ctx) const { - const auto &entity = static_cast(ctx->get_entity()); - VamanaDistCalculator &dc = ctx->dist_calculator(); - VisitFilter &visit = ctx->visit_filter(); - CandidateHeap &candidates = ctx->candidates(); - auto &topk_heap = ctx->topk_heap(); - const IndexFilter &index_filter = - static_cast(ctx)->filter(); - std::function filter = [](node_id_t) { return false; }; - if (index_filter.is_valid()) { - filter = [&](node_id_t id) { - return index_filter(entity.get_key_typed(id)); - }; +// mmap/contiguous variant: resolve vectors via get_vector_ptr +// and dispatch to the classic pointer-array batch_dist. +template +void fast_greedy_search(const EntityType &entity, HeapType &pool, + VisitFilter &visit, VamanaDistCalculator &dc, + uint32_t topk, uint32_t ef, node_id_t entry_point, + uint32_t prefetch_lines) { + const uint32_t max_deg = entity.max_degree(); + const uint32_t cap = std::max(topk, ef); + pool.reset(static_cast(cap), static_cast(max_deg)); + visit.clear(); + + dist_t ep_dist = dc.batch_dist(entry_point); + visit.set_visited(entry_point); + pool.push_block(&ep_dist, &entry_point, 1); + + static constexpr uint32_t GRAPH_PO = 8; + + uint32_t buf_capacity = max_deg; + std::vector neighbor_ids(buf_capacity); + std::vector dists(buf_capacity); + std::vector neighbor_vecs(buf_capacity); + + while (pool.has_next()) { + auto current_node = pool.pop(); + + const auto neighbors = entity.get_neighbors_typed(current_node); + ailego_prefetch(neighbors.data); + + if (neighbors.size() > buf_capacity) { + buf_capacity = neighbors.size(); + neighbor_ids.resize(buf_capacity); + dists.resize(buf_capacity); + neighbor_vecs.resize(buf_capacity); + } + + const uint32_t po = + std::min(static_cast(neighbors.size()), GRAPH_PO); + uint32_t unvisited_count = 0; + uint32_t i = 0; + + for (; i < po; ++i) { + node_id_t node = neighbors[i]; + if (visit.visited(node)) continue; + visit.set_visited(node); + const void *vec_ptr = entity.get_vector_ptr(node); + const char *p = reinterpret_cast(vec_ptr); + for (uint32_t cl = 0; cl < prefetch_lines; ++cl) { + ailego_prefetch(p + cl * 64); + } + neighbor_ids[unvisited_count] = node; + neighbor_vecs[unvisited_count] = vec_ptr; + unvisited_count++; + } + for (; i < neighbors.size(); ++i) { + node_id_t node = neighbors[i]; + if (visit.visited(node)) continue; + visit.set_visited(node); + neighbor_ids[unvisited_count] = node; + neighbor_vecs[unvisited_count] = entity.get_vector_ptr(node); + unvisited_count++; + } + + if (unvisited_count == 0) continue; + dc.batch_dist(neighbor_vecs.data(), unvisited_count, dists.data()); + pool.push_block(dists.data(), neighbor_ids.data(), + static_cast(unvisited_count)); } +} + +// ============================================================================ +// dual_heap_greedy_search: shared core for the fallback dual-heap path. +// +// Maintains a candidate min-heap + topk heap + VisitFilter. Uses plain +// batch_dist. +// ============================================================================ +template +void dual_heap_greedy_search(const EntityType &entity, VamanaContext *ctx, + VamanaDistCalculator &dc, node_id_t entry_point, + FilterFn &&filter) { + static constexpr uint32_t PREFETCH_BATCH = 2; + static constexpr uint32_t PREFETCH_STEP = 2; + + uint32_t buf_capacity = entity.max_degree(); + std::vector neighbor_ids(buf_capacity); + std::vector neighbor_vec_blocks; + neighbor_vec_blocks.reserve(buf_capacity); + std::vector dists(buf_capacity); + std::vector neighbor_vecs(buf_capacity); + VisitFilter &visit = ctx->visit_filter(); + CandidateHeap &candidates = ctx->candidates(); + auto &topk_heap = ctx->topk_heap(); candidates.clear(); visit.clear(); @@ -147,16 +232,6 @@ void VamanaAlgorithm::greedy_search(node_id_t entry_point, } candidates.emplace(entry_point, entry_dist); - // Pre-allocate temporary vectors outside the hot loop to avoid - // per-iteration heap allocations. Sized to max_degree initially; - // resized inside the loop if actual neighbor count exceeds this. - uint32_t buf_capacity = entity.max_degree(); - std::vector neighbor_ids(buf_capacity); - std::vector neighbor_vec_blocks; - neighbor_vec_blocks.reserve(buf_capacity); - std::vector dists(buf_capacity); - std::vector neighbor_vecs(buf_capacity); - while (!candidates.empty() && !ctx->reach_scan_limit()) { auto top = candidates.begin(); node_id_t current_node = top->first; @@ -198,28 +273,17 @@ void VamanaAlgorithm::greedy_search(node_id_t entry_point, neighbor_vec_blocks); if (ailego_unlikely(ret != 0)) break; - // Prefetch for better cache performance - static constexpr uint32_t PREFETCH_BATCH = 2; - static constexpr uint32_t PREFETCH_STEP = 2; for (uint32_t i = 0; i < std::min(PREFETCH_BATCH * PREFETCH_STEP, unvisited_count); ++i) { ailego_prefetch(neighbor_vec_blocks[i].data()); } - // Batch distance computation (reuse pre-allocated buffers) + // Batch distance computation (reuse pre-allocated buffers). for (uint32_t i = 0; i < unvisited_count; ++i) { neighbor_vecs[i] = neighbor_vec_blocks[i].data(); } dc.batch_dist(neighbor_vecs.data(), unvisited_count, dists.data()); - // Update candidates and topk. - // Unlike vanilla DiskANN which inserts all unvisited neighbors into - // the candidate queue unconditionally, we apply an early-pruning - // optimization: a neighbor is only inserted into the candidate queue - // (and topk_heap) if it could potentially improve the final results, - // i.e. either the topk heap is not yet full, or the neighbor is closer - // than the current worst result. This avoids expanding clearly - // unpromising branches and reduces the candidate queue size. for (uint32_t i = 0; i < unvisited_count; ++i) { node_id_t node = neighbor_ids[i]; dist_t node_dist = dists[i]; @@ -233,6 +297,88 @@ void VamanaAlgorithm::greedy_search(node_id_t entry_point, } } +// ============================================================================ +// greedy_search: Beam search from entry_point. +// +// Maintains a candidate min-heap (ordered by distance) and a visited set. +// At each step, pops the closest unvisited candidate, expands its neighbors, +// and adds unvisited neighbors to both the candidate heap and the topk heap. +// Stops when the closest candidate is farther than the worst in topk, or +// when the scan limit is reached. +// ============================================================================ +template +void VamanaAlgorithm::greedy_search(node_id_t entry_point, + VamanaContext *ctx, + bool use_pool) const { + const auto &entity = static_cast(ctx->get_entity()); + VamanaDistCalculator &dc = ctx->dist_calculator(); + + const IndexFilter &index_filter = + static_cast(ctx)->filter(); + + // Number of cache lines per vector (e.g. 2 for dim=128). + // Used by both the fallback candidates/filter path and the fast helpers. + uint32_t prefetch_lines = (dc.dimension() + 63) / 64; + if constexpr (std::is_same_v) { + // Contiguous flat array stride is already 64B-aligned. Use it so that + // prefetch does not overshoot into the next vector. + size_t stride = entity.vector_stride(); + if (stride > 0) { + prefetch_lines = static_cast(stride / 64); + } + } + + if (!use_pool || index_filter.is_valid()) { + // Fallback path used by add_node (use_pool=false) and filtered search. + // Dispatched to dual_heap_greedy_search (plain batch_dist). + auto run_with_filter = [&](auto &&filter) { + dual_heap_greedy_search( + entity, ctx, dc, entry_point, std::forward(filter)); + }; + + if (index_filter.is_valid()) { + auto filter = [&](node_id_t id) { + return index_filter(entity.get_key_typed(id)); + }; + run_with_filter(filter); + } else { + auto filter = [](node_id_t) { return false; }; + run_with_filter(filter); + } + } else { + // Fast pool-based path for mmap/contiguous entities that support + // direct pointer access. BlockHeap (AVX2) or LinearPool (scalar) + // are used for top-k tracking. BufferPool entities fall back to + // dual_heap_greedy_search since they lack direct pointer access. + if constexpr (std::is_same_v) { + const uint32_t topk_v = static_cast(ctx->topk()); + const uint32_t ef_v = ctx->ef(); + const bool avx2_ok = + zvec::ailego::internal::CpuFeatures::static_flags_.AVX2; + auto &topk_heap = ctx->topk_heap(); + + auto &visit = ctx->visit_filter(); + + if (avx2_ok) { + auto &bpool = ctx->block_pool(); + fast_greedy_search(entity, bpool, visit, dc, topk_v, ef_v, entry_point, + prefetch_lines); + copy_pool_to_topk(bpool, topk_heap); + } else { + auto &lpool = ctx->pool(); + fast_greedy_search(entity, lpool, visit, dc, topk_v, ef_v, entry_point, + prefetch_lines); + copy_pool_to_topk(lpool, topk_heap); + } + } else { + // BufferPool entities: fallback to dual-heap path. + auto filter = [](node_id_t) { return false; }; + dual_heap_greedy_search(entity, ctx, dc, + entry_point, filter); + } + } +} + // ============================================================================ // robust_prune: Select up to max_degree diverse neighbors from candidates. // @@ -334,9 +480,11 @@ void VamanaAlgorithm::robust_prune(node_id_t id, } if (batch_count > 0) { - // Batch compute distances from selected candidate to remaining - dc.batch_dist_pair(selected_vec, batch_vecs.data(), batch_count, - batch_dists.data()); + // Compute distances from selected candidate to remaining candidates. + // distance_ is the symmetric data-to-data kernel (no pairwise split). + for (uint32_t k = 0; k < batch_count; ++k) { + batch_dists[k] = dc.dist(selected_vec, batch_vecs[k]); + } // DiskANN (L2/Cosine): // occlude_factor[t] = max(occlude_factor[t], dist_to_query / diff --git a/src/core/algorithm/vamana/vamana_algorithm.h b/src/core/algorithm/vamana/vamana_algorithm.h index c5b2ce9e7..d8e0c9301 100644 --- a/src/core/algorithm/vamana/vamana_algorithm.h +++ b/src/core/algorithm/vamana/vamana_algorithm.h @@ -80,7 +80,8 @@ class VamanaAlgorithm : public VamanaAlgorithmBase { // GreedySearch: starting from entry_point, greedily expand the closest // unvisited candidate until the search list is exhausted or scan limit // is reached. Results accumulate in topk_heap. - void greedy_search(node_id_t entry_point, VamanaContext *ctx) const; + void greedy_search(node_id_t entry_point, VamanaContext *ctx, + bool use_pool) const; // RobustPrune: given a candidate set (topk_heap), select up to max_degree // diverse neighbors using alpha-based distance comparison. diff --git a/src/core/algorithm/vamana/vamana_context.h b/src/core/algorithm/vamana/vamana_context.h index 38cc8457e..b3cacae31 100644 --- a/src/core/algorithm/vamana/vamana_context.h +++ b/src/core/algorithm/vamana/vamana_context.h @@ -14,6 +14,8 @@ #pragma once #include +#include "utility/block_heap.h" +#include "utility/linear_pool.h" #include "utility/visit_filter.h" #include "vamana_dist_calculator.h" #include "vamana_entity.h" @@ -119,6 +121,14 @@ class VamanaContext : public IndexContext { inline TopkHeap &update_heap() { return update_heap_; } + inline LinearPool &pool() { + return pool_; + } + // Block-insert pool used by the AVX2-gated greedy_search fast path. + // Only accessed under a runtime CpuFeatures::AVX2 guard at call sites. + inline BlockHeap &block_pool() { + return block_pool_; + } inline VisitFilter &visit_filter() { return visit_filter_; } @@ -256,12 +266,6 @@ class VamanaContext : public IndexContext { return topk_; } - inline void update_dist_caculator_distance( - const IndexMetric::MatrixDistance &distance, - const IndexMetric::MatrixBatchDistance &batch_distance) { - dc_.update_distance(distance, batch_distance); - } - private: void fill_random_to_topk_full(void); @@ -320,6 +324,9 @@ class VamanaContext : public IndexContext { VisitFilter::Mode filter_mode_{VisitFilter::ByteMap}; float filter_negative_prob_{VamanaEntity::kDefaultBFNegativeProbability}; + + LinearPool pool_; + BlockHeap block_pool_; }; } // namespace core diff --git a/src/core/algorithm/vamana/vamana_dist_calculator.h b/src/core/algorithm/vamana/vamana_dist_calculator.h index c78f63810..ed15b0bdf 100644 --- a/src/core/algorithm/vamana/vamana_dist_calculator.h +++ b/src/core/algorithm/vamana/vamana_dist_calculator.h @@ -65,13 +65,6 @@ class VamanaDistCalculator { dim_ = dim; } - inline void update_distance( - const IndexMetric::MatrixDistance &distance, - const IndexMetric::MatrixBatchDistance &batch_distance) { - distance_ = distance; - batch_distance_ = batch_distance; - } - inline void reset_query(const void *query) { error_ = false; query_ = query; @@ -136,22 +129,6 @@ class VamanaDistCalculator { return score; } - // Batch distance computation between a base vector and multiple target - // vectors. Does NOT use query_ and does NOT increment compare_cnt. Used for - // inter-candidate distance computation in robust_prune. - // - // Uses the single distance function (distance_) in a loop rather than - // batch_distance_, because batch_distance_ (turbo AVX512-VNNI) expects - // the second argument to be a preprocessed uint8 query (+128 shift), - // while base_vec here is a raw int8 stored vector. The single distance - // function (AVX2 sign/abs trick) correctly handles two raw int8 inputs. - inline void batch_dist_pair(const void *base_vec, const void **vecs, - uint32_t count, float *dists) { - for (uint32_t i = 0; i < count; ++i) { - distance_(base_vec, vecs[i], dim_, &dists[i]); - } - } - dist_t operator()(const void *vec) { return dist(vec); } diff --git a/src/core/algorithm/vamana/vamana_streamer.cc b/src/core/algorithm/vamana/vamana_streamer.cc index d03163c98..0c08a8238 100644 --- a/src/core/algorithm/vamana/vamana_streamer.cc +++ b/src/core/algorithm/vamana/vamana_streamer.cc @@ -235,6 +235,12 @@ int VamanaStreamer::open(IndexStorage::Pointer stg) { auto metric_params = index_meta.metric_params(); metric_params.merge(meta_.metric_params()); meta_.set_metric(index_meta.metric_name(), 0, metric_params); + // Propagate reformer info from stored meta (needed for quantizers + // whose reformer params are computed during training, e.g. UniformInt8) + if (!index_meta.reformer_name().empty()) { + meta_.set_reformer(index_meta.reformer_name(), 0, + index_meta.reformer_params()); + } } // Create metric @@ -256,17 +262,6 @@ int VamanaStreamer::open(IndexStorage::Pointer stg) { return IndexError_InvalidArgument; } - add_distance_ = metric_->distance(); - add_batch_distance_ = metric_->batch_distance(); - search_distance_ = add_distance_; - search_batch_distance_ = add_batch_distance_; - - if (metric_->query_metric() && metric_->query_metric()->distance() && - metric_->query_metric()->batch_distance()) { - search_distance_ = metric_->query_metric()->distance(); - search_batch_distance_ = metric_->query_metric()->batch_distance(); - } - // Create algorithm based on entity storage mode switch (entity_->storage_mode()) { case VamanaStorageMode::kBufferPool: @@ -456,8 +451,6 @@ int VamanaStreamer::add_impl(uint64_t pkey, const void *query, AILEGO_DEFER([&]() { shared_mutex_.unlock_shared(); }); ctx->clear(); - ctx->update_dist_caculator_distance(add_distance_, add_batch_distance_); - ctx->reset_query(query); ctx->check_need_adjuct_ctx(entity_->doc_cnt()); if (metric_->support_train()) { @@ -529,8 +522,6 @@ int VamanaStreamer::add_with_id_impl(uint32_t id, const void *query, AILEGO_DEFER([&]() { shared_mutex_.unlock_shared(); }); ctx->clear(); - ctx->update_dist_caculator_distance(add_distance_, add_batch_distance_); - ctx->reset_query(query); ctx->check_need_adjuct_ctx(entity_->doc_cnt()); if (metric_->support_train()) { @@ -593,7 +584,6 @@ int VamanaStreamer::search_impl(const void *query, const IndexQueryMeta &qmeta, } ctx->clear(); - ctx->update_dist_caculator_distance(search_distance_, search_batch_distance_); ctx->resize_results(count); ctx->check_need_adjuct_ctx(entity_->doc_cnt()); @@ -655,7 +645,6 @@ int VamanaStreamer::search_bf_impl(const void *query, } ctx->clear(); - ctx->update_dist_caculator_distance(search_distance_, search_batch_distance_); ctx->resize_results(count); const auto &filter = static_cast(ctx)->filter(); @@ -697,7 +686,6 @@ int VamanaStreamer::search_bf_by_p_keys_impl( } ctx->clear(); - ctx->update_dist_caculator_distance(search_distance_, search_batch_distance_); ctx->resize_results(count); auto &topk = ctx->topk_heap(); diff --git a/src/core/algorithm/vamana/vamana_streamer.h b/src/core/algorithm/vamana/vamana_streamer.h index 3446c73ed..a1217c9ac 100644 --- a/src/core/algorithm/vamana/vamana_streamer.h +++ b/src/core/algorithm/vamana/vamana_streamer.h @@ -150,11 +150,6 @@ class VamanaStreamer : public IndexStreamer { IndexMeta meta_{}; IndexMetric::Pointer metric_{}; - IndexMetric::MatrixDistance add_distance_{}; - IndexMetric::MatrixDistance search_distance_{}; - IndexMetric::MatrixBatchDistance add_batch_distance_{}; - IndexMetric::MatrixBatchDistance search_batch_distance_{}; - Stats stats_{}; std::mutex mutex_{}; diff --git a/src/core/algorithm/vamana/vamana_streamer_entity.cc b/src/core/algorithm/vamana/vamana_streamer_entity.cc index b9bb38145..4324ea2b8 100644 --- a/src/core/algorithm/vamana/vamana_streamer_entity.cc +++ b/src/core/algorithm/vamana/vamana_streamer_entity.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "vamana_streamer_entity.h" -#if defined(__linux__) || defined(__APPLE__) -#include -#endif #include #include #include @@ -540,8 +537,12 @@ const VamanaEntity::Pointer VamanaContiguousStreamerEntity::clone() const { } // Share contiguous memory with the clone (zero-copy) - entity->node_memory_ = node_memory_; - entity->node_base_ = node_base_; + entity->vector_memory_ = vector_memory_; + entity->vector_base_ = vector_base_; + entity->vector_stride_ = vector_stride_; + entity->graph_memory_ = graph_memory_; + entity->graph_base_ = graph_base_; + entity->graph_stride_ = graph_stride_; return VamanaEntity::Pointer(entity); } @@ -552,56 +553,60 @@ const VamanaEntity::Pointer VamanaContiguousStreamerEntity::clone() const { char *VamanaContiguousStreamerEntity::allocate_contiguous(size_t size) { if (size == 0) return nullptr; -#if defined(__linux__) - void *ptr = ::mmap(nullptr, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (ptr == MAP_FAILED) { - LOG_ERROR("mmap failed for contiguous memory, size=%zu", size); - return nullptr; - } - ::madvise(ptr, size, MADV_HUGEPAGE); - return static_cast(ptr); -#elif defined(__APPLE__) - void *ptr = ::mmap(nullptr, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); - if (ptr == MAP_FAILED) { - LOG_ERROR("mmap failed for contiguous memory, size=%zu", size); - return nullptr; - } - return static_cast(ptr); -#elif defined(_WIN32) - void *ptr = ::_aligned_malloc(size, ailego::MemoryHelper::PageSize()); - if (!ptr) { - LOG_ERROR("_aligned_malloc failed for contiguous memory, size=%zu", size); - return nullptr; - } - return static_cast(ptr); -#else - void *ptr = std::aligned_alloc(ailego::MemoryHelper::PageSize(), size); + void *ptr = ailego::MemoryHelper::AllocateHugePage(size); if (!ptr) { - LOG_ERROR("aligned_alloc failed, size=%zu", size); + LOG_ERROR("AllocateHugePage failed for contiguous memory, size=%zu", size); return nullptr; } return static_cast(ptr); -#endif } int VamanaContiguousStreamerEntity::build_contiguous_memory() { - node_memory_.reset(); - node_base_ = nullptr; + vector_memory_.reset(); + vector_base_ = nullptr; + vector_stride_ = 0; + graph_memory_.reset(); + graph_base_ = nullptr; const uint32_t total_docs = doc_cnt(); if (total_docs == 0) return 0; const size_t per_node = node_size(); - const size_t total_node_data = static_cast(total_docs) * per_node; - size_t node_memory_size = AlignHugePageSize(total_node_data); - char *raw_node = allocate_contiguous(node_memory_size); - if (!raw_node) return IndexError_Runtime; - node_memory_.reset(raw_node, ContiguousDeleter{node_memory_size}); - node_base_ = raw_node; - - // Copy node data from chunks into contiguous memory + const size_t vec_size = vector_size(); + + // Pad per-vector stride up to kVectorAlignment (64B) so every vector + // starts on a cache-line boundary. + vector_stride_ = + (vec_size + (kVectorAlignment - 1)) & ~(kVectorAlignment - 1); + // graph_stride = key + neighbors (everything except vector) + graph_stride_ = sizeof(key_t) + neighbors_size(); + + // Allocate flat vector array (stride = vector_stride_, padded for 64B) + const size_t total_vec_data = + static_cast(total_docs) * vector_stride_; + size_t vector_memory_size = AlignHugePageSize(total_vec_data); + char *raw_vec = allocate_contiguous(vector_memory_size); + if (!raw_vec) return IndexError_Runtime; + vector_memory_.reset(raw_vec, ContiguousDeleter{vector_memory_size}); + vector_base_ = raw_vec; + + // Allocate graph array (stride = sizeof(key_t) + neighbors_size) + const size_t total_graph_data = + static_cast(total_docs) * graph_stride_; + size_t graph_memory_size = AlignHugePageSize(total_graph_data); + char *raw_graph = allocate_contiguous(graph_memory_size); + if (!raw_graph) { + vector_memory_.reset(); + vector_base_ = nullptr; + vector_stride_ = 0; + return IndexError_Runtime; + } + graph_memory_.reset(raw_graph, ContiguousDeleter{graph_memory_size}); + graph_base_ = raw_graph; + + // Split node data from chunks into vector / graph arrays. + // Original node layout: [vector (vec_size) | key (8B) | neighbors] + // Padding bytes in vector_base_ are left zero (anon mmap is zero-filled). const auto &chunks = node_chunks_; const uint32_t nodes_per_chunk = 1U << node_index_mask_bits_; for (size_t chunk_idx = 0; chunk_idx < chunks.size(); ++chunk_idx) { @@ -613,14 +618,27 @@ int VamanaContiguousStreamerEntity::build_contiguous_memory() { uint32_t count_in_chunk = std::min(nodes_per_chunk, total_docs - base_id); const char *src = static_cast(chunk_data); - char *dst = node_base_ + static_cast(base_id) * per_node; - std::memcpy(dst, src, static_cast(count_in_chunk) * per_node); + for (uint32_t i = 0; i < count_in_chunk; ++i) { + const char *node_src = src + static_cast(i) * per_node; + size_t global_id = static_cast(base_id + i); + + // Copy vector to flat vector array at padded stride + std::memcpy(vector_base_ + global_id * vector_stride_, node_src, + vec_size); + + // Copy key + neighbors to graph array + std::memcpy(graph_base_ + global_id * graph_stride_, node_src + vec_size, + graph_stride_); + } } LOG_INFO( - "Built Vamana contiguous memory: node_size=%zu total_docs=%u " - "node_chunks=%zu", - node_memory_size, total_docs, chunks.size()); + "Built Vamana contiguous memory: " + "vector_mem=%zu graph_mem=%zu total_docs=%u " + "node_chunks=%zu vector_size=%zu vector_stride=%zu " + "(cache-line aligned to %zuB)", + vector_memory_size, graph_memory_size, total_docs, chunks.size(), + vec_size, vector_stride_, kVectorAlignment); return 0; } diff --git a/src/core/algorithm/vamana/vamana_streamer_entity.h b/src/core/algorithm/vamana/vamana_streamer_entity.h index 59652f2f0..ecc3f70b4 100644 --- a/src/core/algorithm/vamana/vamana_streamer_entity.h +++ b/src/core/algorithm/vamana/vamana_streamer_entity.h @@ -16,10 +16,8 @@ #include #include #include -#if defined(__linux__) || defined(__APPLE__) -#include -#endif #include +#include #include #include #include @@ -478,7 +476,7 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity { //! static_cast in the algorithm is safe. const VamanaEntity::Pointer clone() const override; - inline TypedNeighbors get_neighbors_typed(node_id_t id) const { + ailego_force_inline TypedNeighbors get_neighbors_typed(node_id_t id) const { uint32_t chunk_idx = id >> node_index_mask_bits_; uint32_t offset = (id & node_index_mask_) * node_size() + vector_size() + sizeof(key_t); @@ -487,8 +485,9 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity { return TypedNeighbors(std::move(block)); } - inline int get_vector_typed(const node_id_t *ids, uint32_t count, - std::vector &vec_blocks) const { + ailego_force_inline int get_vector_typed( + const node_id_t *ids, uint32_t count, + std::vector &vec_blocks) const { vec_blocks.resize(count); for (auto i = 0U; i < count; ++i) { uint32_t chunk_idx = ids[i] >> node_index_mask_bits_; @@ -499,7 +498,7 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity { return 0; } - inline key_t get_key_typed(node_id_t id) const { + ailego_force_inline key_t get_key_typed(node_id_t id) const { if (!use_key_info_map_) return id; uint32_t chunk_idx = id >> node_index_mask_bits_; uint32_t offset = (id & node_index_mask_) * node_size() + vector_size(); @@ -507,8 +506,17 @@ class VamanaMmapStreamerEntity : public VamanaStreamerEntity { return *reinterpret_cast(base + offset); } + //! Direct vector pointer access (no MemoryBlock wrapper). + //! For use in the merged search loop to avoid intermediate allocations. + ailego_force_inline const void *get_vector_ptr(node_id_t id) const { + uint32_t chunk_idx = id >> node_index_mask_bits_; + uint32_t offset = (id & node_index_mask_) * node_size(); + return get_node_chunk_base(chunk_idx) + offset; + } + private: - inline const char *get_node_chunk_base(uint32_t chunk_idx) const { + ailego_force_inline const char *get_node_chunk_base( + uint32_t chunk_idx) const { if (ailego_unlikely(chunk_idx >= node_chunk_bases_.size())) { sync_node_chunk_bases(chunk_idx); } @@ -569,9 +577,11 @@ class VamanaBufferPoolStreamerEntity : public VamanaStreamerEntity { }; // --- Typed entity subclass for contiguous memory mode --- -// Allocates contiguous memory and copies all chunk data into it. -// Access is via a single base pointer + offset, eliminating chunk-level -// indirection and maximizing memory locality. +// Splits node data into two dense arrays during build: +// 1. vector_base_: flat vector array (stride = vector_size) +// 2. graph_base_: key + neighbors (stride = graph_stride_) +// Total memory = vector_size + graph_stride_ per node (same as original +// node_size), but each access pattern gets optimal cache locality. class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity { public: using VamanaMmapStreamerEntity::VamanaMmapStreamerEntity; @@ -592,13 +602,23 @@ class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity { //! Degrade to mmap mode by releasing contiguous memory and falling back //! to chunk-based access. void degrade_to_mmap() { - node_memory_.reset(); - node_base_ = nullptr; + vector_memory_.reset(); + vector_base_ = nullptr; + vector_stride_ = 0; + graph_memory_.reset(); + graph_base_ = nullptr; LOG_INFO("Vamana contiguous entity degraded to mmap mode for insertion"); } bool is_contiguous() const { - return node_base_ != nullptr; + return vector_base_ != nullptr; + } + + //! Per-entry stride of the flat vector array (0 if no contiguous build). + //! Padded up to kVectorAlignment (64B), so it is also the amount that + //! should be prefetched per vector. + size_t vector_stride() const { + return vector_stride_; } int add_vector(key_t key, const void *vec, node_id_t *id) override { @@ -611,23 +631,25 @@ class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity { return VamanaMmapStreamerEntity::add_vector_with_id(id, vec); } - inline TypedNeighbors get_neighbors_typed(node_id_t id) const { - if (ailego_likely(node_base_ != nullptr)) { - const char *ptr = node_base_ + static_cast(id) * node_size() + - vector_size() + sizeof(key_t); + ailego_force_inline TypedNeighbors get_neighbors_typed(node_id_t id) const { + if (ailego_likely(graph_base_ != nullptr)) { + // graph layout: [key (sizeof(key_t)) | NeighborsHeader + neighbors] + const char *ptr = + graph_base_ + static_cast(id) * graph_stride_ + sizeof(key_t); MmapMemoryBlock block(const_cast(ptr)); return TypedNeighbors(std::move(block)); } return VamanaMmapStreamerEntity::get_neighbors_typed(id); } - inline int get_vector_typed(const node_id_t *ids, uint32_t count, - std::vector &vec_blocks) const { - if (ailego_likely(node_base_ != nullptr)) { + ailego_force_inline int get_vector_typed( + const node_id_t *ids, uint32_t count, + std::vector &vec_blocks) const { + if (ailego_likely(vector_base_ != nullptr)) { vec_blocks.resize(count); for (auto i = 0U; i < count; ++i) { const char *ptr = - node_base_ + static_cast(ids[i]) * node_size(); + vector_base_ + static_cast(ids[i]) * vector_stride_; vec_blocks[i].reset(const_cast(ptr)); } return 0; @@ -635,37 +657,53 @@ class VamanaContiguousStreamerEntity : public VamanaMmapStreamerEntity { return VamanaMmapStreamerEntity::get_vector_typed(ids, count, vec_blocks); } - inline key_t get_key_typed(node_id_t id) const { - if (ailego_likely(node_base_ != nullptr)) { + ailego_force_inline key_t get_key_typed(node_id_t id) const { + if (ailego_likely(graph_base_ != nullptr)) { if (!use_key_info_map_) return id; - const char *ptr = - node_base_ + static_cast(id) * node_size() + vector_size(); + // key is at offset 0 within each graph node + const char *ptr = graph_base_ + static_cast(id) * graph_stride_; return *reinterpret_cast(ptr); } return VamanaMmapStreamerEntity::get_key_typed(id); } + //! Direct vector pointer from flat vector array. + //! Stride is padded up to kVectorAlignment (64B) to preserve cache-line + //! alignment even when vector_size is not a multiple of 64. The padding is + //! purely in-memory and does NOT affect the on-disk index file layout. + ailego_force_inline const void *get_vector_ptr(node_id_t id) const { + if (ailego_likely(vector_base_ != nullptr)) { + return vector_base_ + static_cast(id) * vector_stride_; + } + return VamanaMmapStreamerEntity::get_vector_ptr(id); + } + protected: - //! Custom deleter for contiguous memory (munmap / _aligned_free / free) + //! Custom deleter for contiguous memory allocated via + //! MemoryHelper::AllocateHugePage. `size` is the (already huge-page-aligned) + //! length passed at allocation time, required by the mmap/munmap path. struct ContiguousDeleter { size_t size; void operator()(char *ptr) const { - if (!ptr) return; -#if defined(__linux__) || defined(__APPLE__) - ::munmap(ptr, size); -#elif defined(_WIN32) - ::_aligned_free(ptr); -#else - std::free(ptr); -#endif + ailego::MemoryHelper::FreeHugePage(ptr, size); } }; - //! Shared ownership of contiguous memory (enables zero-copy clone) - std::shared_ptr node_memory_{}; - - //! Raw pointer for hot-path access (derived from shared_ptr) - char *node_base_{nullptr}; + //! Flat vector array: vectors stored densely with per-vector stride + //! padded up to kVectorAlignment (64B) to keep each vector's starting + //! address cache-line aligned. Base is page-aligned by the allocator. + std::shared_ptr vector_memory_{}; + char *vector_base_{nullptr}; + //! Per-vector stride = AlignUp(vector_size(), kVectorAlignment). + size_t vector_stride_{0}; + + //! Graph array: [key | neighbors] stored densely (stride = graph_stride_). + std::shared_ptr graph_memory_{}; + char *graph_base_{nullptr}; + size_t graph_stride_{0}; // sizeof(key_t) + neighbors_size() + + //! Cache-line alignment used for per-vector stride in the flat array. + static constexpr size_t kVectorAlignment = 64; private: static char *allocate_contiguous(size_t size); diff --git a/src/core/utility/CMakeLists.txt b/src/core/utility/CMakeLists.txt index 7c3adf702..97e6fa360 100644 --- a/src/core/utility/CMakeLists.txt +++ b/src/core/utility/CMakeLists.txt @@ -6,6 +6,26 @@ if(NOT APPLE) "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") endif() +# block_heap.cc uses AVX2 intrinsics (guarded by __AVX2__) for its +# push_block fast path. When the host toolchain supports it, compile this +# source with an AVX2-capable -march so the AVX2 codegen is enabled. On +# other hosts the scalar fallback inside __AVX2__ guards is compiled +# instead, and callers runtime-gate invocation on CpuFeatures::AVX2. +if(NOT ANDROID AND AUTO_DETECT_ARCH) + if(HOST_ARCH MATCHES "^(x86|x64)$") + setup_compiler_march_for_x86( + _BLOCK_HEAP_MARCH_SSE _BLOCK_HEAP_MARCH_AVX2 + _BLOCK_HEAP_MARCH_AVX512 _BLOCK_HEAP_MARCH_AVX512FP16) + if(_BLOCK_HEAP_MARCH_AVX2) + set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/block_heap.cc + PROPERTIES + COMPILE_FLAGS "${_BLOCK_HEAP_MARCH_AVX2}" + ) + endif() + endif() +endif() + cc_library( NAME core_utility STATIC SHARED STRICT ALWAYS_LINK diff --git a/src/core/utility/block_heap.cc b/src/core/utility/block_heap.cc new file mode 100644 index 000000000..bef50663f --- /dev/null +++ b/src/core/utility/block_heap.cc @@ -0,0 +1,184 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// This translation unit hosts the AVX2-accelerated push_block implementation +// for BlockHeap. The build system compiles this .cc with an AVX2-capable +// `-march` when the toolchain/host supports it (see src/core/CMakeLists.txt +// and src/core/utility/CMakeLists.txt); otherwise the scalar fallback below +// is used. Callers must still runtime-gate invocation on CpuFeatures::AVX2 +// because a binary compiled on an AVX2 host may be deployed on a lower-arch +// machine, in which case the AVX2 code here would fault. + +// linear_pool.h (pulled in by block_heap.h) uses printf but does not +// #include ; include it here before block_heap.h so the template +// non-dependent name lookup at definition time succeeds. +#include "block_heap.h" +#include +#include +#include +#include "zvec/ailego/internal/platform.h" + +#if defined(__AVX2__) +#include +#endif + +namespace zvec { +namespace core { + +void BlockHeap::reset(int32_t capacity, int32_t block_size) { + ef_ = capacity; + block_size_ = block_size; + data_.clear(); + const size_t reserve_cnt = + static_cast(std::max(capacity, block_size)) + + static_cast(block_size); + data_.reserve(reserve_cnt); + tmp_.clear(); + tmp_.reserve(static_cast(block_size)); + cur_ = 0; +} + +uint32_t BlockHeap::pop() { + size_t ret_idx = cur_; + set_checked(data_[cur_].first); + while (cur_ < data_.size() && is_checked(data_[cur_].first)) { + ++cur_; + } + return get_id(data_[ret_idx].first); +} + +void BlockHeap::to_sorted(uint32_t *ids, float *scores, int32_t length) const { + const int32_t n = std::min(length, static_cast(data_.size())); + for (int32_t i = 0; i < n; ++i) { + ids[i] = get_id(data_[i].first); + if (scores != nullptr) { + scores[i] = data_[i].second; + } + } +} + +void BlockHeap::push_block(const float *distances, const uint32_t *nodes, + int32_t block_size) { + // Phase 1: collect candidates with dist < current threshold into tmp_. + if (static_cast(data_.size()) == ef_) { + const float max_dist = data_.back().second; +#if defined(__AVX2__) + const __m256 threshold_vec = _mm256_set1_ps(max_dist); + int32_t i = 0; + for (; i + 8 <= block_size; i += 8) { + __m256 d = _mm256_loadu_ps(distances + i); + __m256 mask = _mm256_cmp_ps(d, threshold_vec, _CMP_LT_OS); + int bitmask = _mm256_movemask_ps(mask); + if (bitmask == 0) { + continue; + } + while (bitmask) { + int tz = ailego_ctz32(bitmask); + tmp_.emplace_back(nodes[i + tz], distances[i + tz]); + bitmask &= bitmask - 1; + } + } + for (; i < block_size; ++i) { + if (distances[i] < max_dist) { + tmp_.emplace_back(nodes[i], distances[i]); + } + } +#else + for (int32_t i = 0; i < block_size; ++i) { + if (distances[i] < max_dist) { + tmp_.emplace_back(nodes[i], distances[i]); + } + } +#endif + } else { + for (int32_t i = 0; i < block_size; ++i) { + tmp_.emplace_back(nodes[i], distances[i]); + } + } + if (tmp_.empty()) { + return; + } + + // Phase 2: sort tmp_ ascending by distance (with ef truncation). + auto cmp = [](const std::pair &a, + const std::pair &b) { + return a.second < b.second; + }; + if (static_cast(tmp_.size()) > ef_) { + // nth_element + sort of the top-ef slice is O(n) + O(k log k), which is + // faster than partial_sort's O(n log k) for the hot path. + std::nth_element(tmp_.begin(), tmp_.begin() + ef_, tmp_.end(), cmp); + tmp_.resize(static_cast(ef_)); + } + if (tmp_.size() <= 32) { + // Insertion sort for small arrays — branch-predictor friendly and has + // lower overhead than std::sort for tiny inputs. + for (size_t i = 1; i < tmp_.size(); ++i) { + auto key = tmp_[i]; + int32_t j = static_cast(i) - 1; + while (j >= 0 && tmp_[j].second > key.second) { + tmp_[j + 1] = tmp_[j]; + --j; + } + tmp_[j + 1] = key; + } + } else { + std::sort(tmp_.begin(), tmp_.end(), cmp); + } + + // Phase 3: in-place merge (tail-write) data_ and tmp_, truncated at ef_. + const int32_t old_data_size = static_cast(data_.size()); + const int32_t tmp_size = static_cast(tmp_.size()); + int32_t i = old_data_size - 1; + int32_t j = tmp_size - 1; + int32_t write_pos = old_data_size + tmp_size - 1; + data_.resize(std::min(static_cast(old_data_size + tmp_size), + static_cast(ef_))); + // Drop the overflow tail (entries past ef_): advance i/j without writing, + // since data_[write_pos] would be out of bounds. + while (write_pos >= ef_) { + if (data_[i].second > tmp_[j].second) { + --i; + } else { + --j; + } + --write_pos; + } + // Merge phase: consume the larger of data_[i]/tmp_[j] into data_[write_pos]. + while (i >= 0 && j >= 0) { + if (data_[i].second > tmp_[j].second) { + data_[write_pos--] = data_[i--]; + } else { + data_[write_pos--] = tmp_[j--]; + } + } + if (j >= 0) { + // tmp_ entries remaining at front — copy them and reset cursor so the + // caller re-scans from the head. + while (j >= 0) { + data_[write_pos--] = tmp_[j--]; + } + cur_ = 0; + } else { + // All tmp_ entries consumed; old data_[0..i] are already in place. + // Move cursor back if new items were inserted ahead of it. + if (static_cast(write_pos + 1) <= cur_) { + cur_ = static_cast(write_pos + 1); + } + } + tmp_.clear(); +} + +} // namespace core +} // namespace zvec diff --git a/src/core/utility/block_heap.h b/src/core/utility/block_heap.h new file mode 100644 index 000000000..ba3b07ee1 --- /dev/null +++ b/src/core/utility/block_heap.h @@ -0,0 +1,116 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +namespace zvec { +namespace core { + +// BlockHeap is a block-insert optimized alternative to LinearPool for graph +// search. It receives candidates in batches (push_block) and maintains a +// distance-sorted prefix of size ef with amortized O(k) bookkeeping per +// batch, replacing LinearPool's one-by-one sorted insert. +// +// Derived from pyglass' BlockHeap (https://github.com/zilliztech/pyglass, +// MIT License; see the NOTICE file and linear_pool.h for the full attribution). +// The graph prefetch is intentionally omitted: the call-site is expected to +// issue the neighbor-array prefetch itself (Vamana's greedy_search already +// does so). +// +// AVX2 requirement +// ---------------- +// The implementation uses AVX2 intrinsics in push_block for the common case +// where the pool is full and we need to filter a block of candidates against +// the current distance threshold. The intrinsics are confined to +// block_heap.cc and guarded with `#if defined(__AVX2__)`, so this header is +// always safe to include. Callers MUST gate the invocation of BlockHeap-based +// code paths on a runtime CpuFeatures::AVX2 check to avoid illegal +// instructions when running on a low-arch machine with a binary built on a +// higher-arch host. +struct BlockHeap { + BlockHeap() = default; + ~BlockHeap() = default; + + BlockHeap(const BlockHeap &) = delete; + BlockHeap &operator=(const BlockHeap &) = delete; + + BlockHeap(BlockHeap &&) = default; + BlockHeap &operator=(BlockHeap &&) = default; + + // Reset the pool state for a new search round. `capacity` is the retained + // top-k size, `block_size` is an upper bound on the per-call push_block size + // (used only for capacity hints). Visited-node tracking is no longer owned + // by the pool — the caller passes a VisitFilter reference instead. + void reset(int32_t capacity, int32_t block_size); + + // Insert a block of candidates. The distance array must have at least + // `block_size` entries and the id array must have the same length. + // `block_size` may differ from the value passed to reset(); reset()'s + // block_size is only a capacity hint. + void push_block(const float *distances, const uint32_t *nodes, + int32_t block_size); + + // Is there an unpopped candidate? + bool has_next() const { + return cur_ < data_.size(); + } + + // Pop the closest unpopped candidate id (without the check bit). + // Caller must ensure has_next() is true. + uint32_t pop(); + + // Retained candidate count. + int32_t size() const { + return static_cast(data_.size()); + } + + // Export sorted top-`length` ids (and optionally scores) — data_ is already + // distance-sorted ascending. + void to_sorted(uint32_t *ids, float *scores, int32_t length) const; + + // Direct sorted accessors (used by search result copy-out). + uint32_t id(int32_t i) const { + return get_id(data_[i].first); + } + float dist(int32_t i) const { + return data_[i].second; + } + + // Internal check-bit helpers (high bit marks a popped entry). + static constexpr uint32_t kCheckedBit = 0x80000000u; + static constexpr uint32_t kIdMask = 0x7FFFFFFFu; + + static void set_checked(uint32_t &id) { + id |= kCheckedBit; + } + static bool is_checked(uint32_t id) { + return (id & kCheckedBit) != 0u; + } + static uint32_t get_id(uint32_t id) { + return id & kIdMask; + } + + private: + std::vector> data_; + std::vector> tmp_; + int32_t ef_{0}; + int32_t block_size_{0}; + size_t cur_{0}; +}; + +} // namespace core +} // namespace zvec diff --git a/src/core/utility/linear_pool.h b/src/core/utility/linear_pool.h new file mode 100644 index 000000000..20c3bc8c4 --- /dev/null +++ b/src/core/utility/linear_pool.h @@ -0,0 +1,237 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// =========================================================================== +// Acknowledgement +// --------------- +// The LinearPool implementation in this file (and the accompanying Neighbor +// helper) is adapted from the pyglass project, with modifications +// (e.g. a BlockHeap-compatible reset()/push_block() interface): +// +// pyglass — Graph Library for Approximate Similarity Search +// https://github.com/zilliztech/pyglass +// +// pyglass is distributed under the MIT License. The original copyright notice +// and permission notice are reproduced below as required by that license: +// +// MIT License +// +// Copyright (c) 2023 zh Wang +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// =========================================================================== +#pragma once + +#include +#include +#include + +namespace zvec { +namespace core { + +namespace linear_pool_impl { + +template +struct Neighbor { + int id; + dist_t distance; + + Neighbor() = default; + Neighbor(int id, dist_t distance) : id(id), distance(distance) {} + + inline friend bool operator<(const Neighbor &lhs, const Neighbor &rhs) { + return lhs.distance < rhs.distance || + (lhs.distance == rhs.distance && lhs.id < rhs.id); + } + + inline friend bool operator>(const Neighbor &lhs, const Neighbor &rhs) { + return !(lhs < rhs); + } +}; + +} // namespace linear_pool_impl + +template +struct LinearPool { + using dist_type = dist_t; + + LinearPool() = default; + + LinearPool(int ef, int capacity) + : ef_(ef), capacity_(capacity), data_(capacity_ + 1) {} + + friend void swap(LinearPool &lhs, LinearPool &rhs) { + using std::swap; + swap(lhs.size_, rhs.size_); + swap(lhs.cur_, rhs.cur_); + swap(lhs.ef_, rhs.ef_); + swap(lhs.capacity_, rhs.capacity_); + swap(lhs.data_, rhs.data_); + } + + LinearPool(const LinearPool &) = delete; + + LinearPool(LinearPool &&rhs) { + swap(*this, rhs); + } + + LinearPool &operator=(const LinearPool &) = delete; + + LinearPool &operator=(LinearPool &&rhs) { + swap(*this, rhs); + return *this; + } + + // Reset the pool state for a new search round. `capacity` is the retained + // top-k size, `block_size` is ignored (kept for API parity with BlockHeap). + // Visited-node tracking is no longer owned by the pool — the caller passes + // a VisitFilter reference to the search loop instead. + void reset(int32_t capacity, int32_t /*block_size_ignored*/) { + size_ = cur_ = 0; + ef_ = capacity; + capacity_ = capacity; + if (data_.size() < static_cast(capacity + 1)) { + data_.resize(capacity + 1); + } + } + + ailego_force_inline int find_bsearch(dist_t dist) { + int lo = 0, hi = size_; + while (lo < hi) { + int mid = (lo + hi) / 2; + if (data_[mid].distance > dist) { + hi = mid; + } else { + lo = mid + 1; + } + } + return lo; + // int len = size_; + // int loc = 0; + // while (len > 1) { + // int half = len / 2; + // loc += (dist > data_[loc + half - 1].distance) * half; + // len -= half; + // } + // return loc; + } + + // Block-insert interface matching BlockHeap::push_block: insert each + // (node, distance) pair via the one-by-one sorted insert(). Used by the + // templated greedy_search helpers so that LinearPool can be plugged in + // when AVX2 is unavailable. + void push_block(const float *distances, const uint32_t *nodes, + int32_t block_size) { + for (int32_t i = 0; i < block_size; ++i) { + insert(static_cast(nodes[i]), static_cast(distances[i])); + } + } + + ailego_force_inline bool insert(int u, dist_t dist) { + if (size_ == capacity_ && dist >= data_[size_ - 1].distance) { + return false; + } + int lo = find_bsearch(dist); + std::memmove(&data_[lo + 1], &data_[lo], + (size_ - lo) * sizeof(linear_pool_impl::Neighbor)); + data_[lo] = {u, dist}; + if (size_ < capacity_) { + size_++; + } + if (lo < cur_) { + cur_ = lo; + } + return true; + } + + int pop() { + set_checked(data_[cur_].id); + int pre = cur_; + while (cur_ < size_ && is_checked(data_[cur_].id)) { + cur_++; + } + return get_id(data_[pre].id); + } + + bool has_next() const { + return cur_ < size_ && cur_ < ef_; + } + int id(int i) const { + return get_id(data_[i].id); + } + dist_type dist(int i) const { + return data_[i].distance; + } + int size() const { + return size_; + } + int capacity() const { + return capacity_; + } + + constexpr static int kMask = 2147483647; + int get_id(int id) const { + return id & kMask; + } + void set_checked(int &id) { + id |= 1 << 31; + } + bool is_checked(int id) const { + return id >> 31 & 1; + } + + void to_sorted(int32_t *ids, float *scores, int32_t length) const { + for (int32_t i = 0; i < length; ++i) { + ids[i] = id(i); + if (scores) { + scores[i] = dist(i); + } + } + } + + int size_ = 0, cur_ = 0, ef_ = 0, capacity_ = 0; + std::vector> data_; +}; + +// Copy a single-heap pool's (LinearPool/BlockHeap) distance-sorted retained +// results into a topk heap. Both the pool and the topk heap are template +// parameters so this helper is independent of any per-algorithm TopkHeap alias +// and can be shared by Vamana and HNSW greedy search. +template +void copy_pool_to_topk(const PoolType &pool, TopkType &topk) { + const int32_t n = static_cast(pool.size()); + for (int32_t i = 0; i < n; ++i) { + topk.emplace(pool.id(i), pool.dist(i)); + } +} + +} // namespace core +} // namespace zvec diff --git a/src/include/zvec/ailego/internal/platform.h b/src/include/zvec/ailego/internal/platform.h index d30cb8865..2b1c1a0e1 100644 --- a/src/include/zvec/ailego/internal/platform.h +++ b/src/include/zvec/ailego/internal/platform.h @@ -219,6 +219,13 @@ static inline int ailego_clz64(uint64_t x) { #define ailego_prefetch(p) (__builtin_prefetch((p))) #endif // _MSC_VER +//! Force-inline hint: use on hot-path accessors (3-5 instructions). +#ifdef _MSC_VER +#define ailego_force_inline __forceinline +#else +#define ailego_force_inline inline __attribute__((always_inline)) +#endif + #if defined(AILEGO_M64) #define ailego_ctz ailego_ctz64 #define ailego_clz ailego_clz64 diff --git a/src/turbo/avx512_vnni/record_quantized_int8/common.h b/src/turbo/avx512_vnni/record_quantized_int8/common.h index 465caf6cf..e72c8f014 100644 --- a/src/turbo/avx512_vnni/record_quantized_int8/common.h +++ b/src/turbo/avx512_vnni/record_quantized_int8/common.h @@ -26,12 +26,7 @@ #include #include #include - -#ifdef _MSC_VER -#define TURBO_ALWAYS_INLINE __forceinline -#else -#define TURBO_ALWAYS_INLINE inline __attribute__((always_inline)) -#endif +#include namespace zvec::turbo::avx512_vnni::internal { @@ -48,7 +43,7 @@ static inline int32_t HorizontalAdd_INT32_V256(__m256i v) { // Compute the raw integer inner product of two int8 vectors of length `size`. // The result is written to `*distance` as a float. // Both `a` and `b` must point to int8_t arrays. -static TURBO_ALWAYS_INLINE void ip_int8_avx512_vnni(const void *a, +static ailego_force_inline void ip_int8_avx512_vnni(const void *a, const void *b, size_t size, float *distance) { const __m256i ONES_INT16_AVX = _mm256_set1_epi32(0x00010001); @@ -219,7 +214,7 @@ static TURBO_ALWAYS_INLINE void ip_int8_avx512_vnni(const void *a, // Shift the first `original_dim` bytes of `query` in-place from int8 to uint8 // by adding 128 to each element. The metadata tail beyond `original_dim` is // left untouched. This prepares the query for use with dpbusd (uint8 * int8). -static TURBO_ALWAYS_INLINE void shift_int8_to_uint8_avx512( +static ailego_force_inline void shift_int8_to_uint8_avx512( void *query, size_t original_dim) { const int8_t *input = reinterpret_cast(query); uint8_t *output = reinterpret_cast(query); @@ -244,7 +239,7 @@ static TURBO_ALWAYS_INLINE void shift_int8_to_uint8_avx512( // single query. Uses AVX512-VNNI dpbusd instruction. // `query` is treated as uint8 (preprocessed), `vectors[i]` as int8. template -TURBO_ALWAYS_INLINE void ip_int8_batch_avx512_vnni_impl( +ailego_force_inline void ip_int8_batch_avx512_vnni_impl( const void *query, const void *const *vectors, const std::array &prefetch_ptrs, size_t dimensionality, float *distances) { @@ -289,7 +284,7 @@ TURBO_ALWAYS_INLINE void ip_int8_batch_avx512_vnni_impl( } // Dispatch batched inner product over all `n` vectors with prefetching. -static TURBO_ALWAYS_INLINE void ip_int8_batch_avx512_vnni( +static ailego_force_inline void ip_int8_batch_avx512_vnni( const void *const *vectors, const void *query, size_t n, size_t dim, float *distances) { static constexpr size_t batch_size = 2;