Skip to content

Commit d519ddf

Browse files
unamedkr and claude committed
Accelerate cblas: implemented but disabled — dequant cost too high
cblas_sgemv (AMX) is 36x faster than manual dot for FP32 512×2048. BUT: IQ2→FP32 dequant per cache miss costs ~15ms, overwhelming the 0.019ms cblas gain. With 256 experts/layer, miss rate is >90%.

Key learning:
- cblas alone: 12ms/token (83 tok/s) — incredible
- dequant+cblas: 1720ms/token (0.6 tok/s) — dequant dominates
- fused IQ2 dot: 370ms/token (3.7 tok/s) — no dequant needed

cblas would win IF we could pre-dequant all experts (90 GB FP32). The fundamental bottleneck is IQ2_XXS decode complexity.

Added: Accelerate framework linkage, cblas cache infrastructure, Metal matmul test (individual IQ2 kernel works, MoE dispatch hangs).

Stable: 3.7 tok/s (fused IQ2 NEON dot, 6 threads)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2c6ea08 commit d519ddf

5 files changed

Lines changed: 1014 additions & 2 deletions

File tree

CMakeLists.txt

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,16 @@ add_library(turboquant STATIC
2727
target_include_directories(turboquant PUBLIC include)
2828
target_link_libraries(turboquant PRIVATE m Threads::Threads)
2929

30+
# Apple Accelerate framework (cblas_sgemv via AMX coprocessor)
31+
if(APPLE)
32+
find_library(ACCELERATE_LIB Accelerate)
33+
if(ACCELERATE_LIB)
34+
target_link_libraries(turboquant PRIVATE ${ACCELERATE_LIB})
35+
target_compile_definitions(turboquant PRIVATE TQ_HAS_ACCELERATE=1 ACCELERATE_NEW_LAPACK=1)
36+
message(STATUS "TurboQuant: Accelerate framework enabled (cblas/AMX)")
37+
endif()
38+
endif()
39+
3040
# Shared library for Python bindings
3141
add_library(turboquant_shared SHARED
3242
${TQ_CORE_SOURCES}
@@ -36,6 +46,12 @@ add_library(turboquant_shared SHARED
3646
)
3747
target_include_directories(turboquant_shared PUBLIC include)
3848
target_link_libraries(turboquant_shared PRIVATE m Threads::Threads)
49+
50+
# Accelerate for shared library too
51+
if(APPLE AND ACCELERATE_LIB)
52+
target_link_libraries(turboquant_shared PRIVATE ${ACCELERATE_LIB})
53+
target_compile_definitions(turboquant_shared PRIVATE TQ_HAS_ACCELERATE=1 ACCELERATE_NEW_LAPACK=1)
54+
endif()
3955
set_target_properties(turboquant_shared PROPERTIES
4056
OUTPUT_NAME turboquant
4157
POSITION_INDEPENDENT_CODE ON)
@@ -139,6 +155,11 @@ if(TQ_BUILD_TESTS)
139155
add_executable(${test_name} ${test_src})
140156
target_link_libraries(${test_name} turboquant GTest::gtest_main)
141157
add_test(NAME ${test_name} COMMAND ${test_name})
158+
159+
# Pass Metal availability to test targets
160+
if(TQ_BUILD_METAL AND APPLE)
161+
target_compile_definitions(${test_name} PRIVATE TQ_HAS_METAL=1)
162+
endif()
142163
endforeach()
143164

144165
# llama.cpp integration test

0 commit comments

Comments
 (0)