Skip to content

Commit befef02

Browse files
unamedkr and claude committed
Disable Q8 LRU cache — conversion cost exceeds fused IQ2 dot speed
Q8 cache (like the Q4 cache before it) is slower due to:
- IQ2→FP32→Q8 conversion cost on every cache miss
- Low expert reuse rate (256 experts, 32 slots → frequent eviction)
- Direct fused_dot_iq2_xxs_neon is already reasonably fast

Stable: 3.6-3.8 tok/s with fused IQ2 dot + multi-thread

Key learning: for a 256-expert MoE with diverse routing, caching schemes don't help because the miss rate is too high. A fundamentally different approach is needed for the next speedup.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6cf4501 commit befef02

1 file changed

Lines changed: 116 additions & 59 deletions

File tree

src/engine/tq_moe.c

Lines changed: 116 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
*
44
* Implements top-K expert selection with softmax renormalization,
55
* SwiGLU FFN dispatch per expert, shared expert support,
6-
* runtime LRU Q4 cache for routed experts, and memory advise hints.
6+
* runtime LRU Q8_0 cache for routed experts, and memory advise hints.
77
*/
88

99
#include "turboquant/tq_gguf.h"
@@ -88,27 +88,85 @@ static void swiglu_fused(float* restrict hb, const float* restrict hb2, int n) {
8888
#endif
8989

9090
/* ============================================================
91-
* Runtime Expert Q4 LRU Cache
91+
* Runtime Expert Q8_0 LRU Cache
9292
*
9393
* MoE models with 256 experts x 40 layers would need ~19 GB
94-
* if all experts were pre-converted to Q4. Instead, we cache
95-
* only the EXPERT_CACHE_SIZE most-recently-used experts per
96-
* layer. With 32 slots per layer, this is ~1.9 GB total.
94+
* if all experts were pre-converted. Instead, we cache only the
95+
* EXPERT_CACHE_SIZE most-recently-used experts per layer in Q8_0
96+
* block format (34 bytes per 32 elements = ~1.06 bytes/elem).
9797
*
98-
* Cache hits use fast Q4 matmul; misses dequant from GGUF
99-
* mmap on the fly, then cache the result for next time.
98+
* Q8_0 fused dot is ~3-5x faster than IQ2_XXS fused dot because
99+
* it avoids E8 lattice codebook lookups — just int8*float FMA.
100+
* On cache miss, we dequant IQ2_XXS → FP32 → Q8_0 blocks once.
101+
* On cache hit, tq_matmul_gguf dispatches to fused_dot_q8_0.
102+
*
103+
* Memory: 34 bytes/32 elems ≈ 1.0625 B/elem. For expert with
104+
* 3M params (gate+up+down), that's ~3.2 MB per cached expert.
105+
* 32 slots/layer × 3.2 MB ≈ 102 MB/layer. For 30 layers: ~3 GB.
100106
* ============================================================ */
101107

102108
#define EXPERT_CACHE_SIZE 32 /* per layer */
103109

110+
/* FP32 → FP16 (IEEE 754 binary16) conversion for Q8_0 block scale fields.
 *
 * Rounds to nearest, ties to even. Unlike a plain truncating conversion:
 *   - results below the fp16 normal range are emitted as fp16 subnormals
 *     instead of being flushed to zero, so a block with a very small scale
 *     is not silently turned into all zeros;
 *   - NaN stays NaN (quiet NaN 0x7E00 with the input sign) instead of
 *     being reported as infinity;
 *   - magnitudes too large for fp16 become signed infinity.
 *
 * Returns the raw 16-bit fp16 bit pattern. */
static inline uint16_t fp32_to_fp16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);

    const uint32_t sign = (bits >> 16) & 0x8000u;
    const uint32_t magnitude = bits & 0x7FFFFFFFu;
    uint32_t mant = magnitude & 0x007FFFFFu;
    /* Re-bias the exponent from fp32 (127) to fp16 (15). */
    const int32_t exp = (int32_t)(magnitude >> 23) - 127 + 15;

    if (magnitude > 0x7F800000u) {
        /* NaN: keep it a NaN rather than collapsing to infinity */
        return (uint16_t)(sign | 0x7E00u);
    }
    if (exp >= 31) {
        /* Infinity, or finite value beyond fp16 range */
        return (uint16_t)(sign | 0x7C00u);
    }
    if (exp < -10) {
        /* Below half of the smallest fp16 subnormal (2^-25): rounds to zero */
        return (uint16_t)sign;
    }

    uint32_t shift;
    uint32_t half;
    if (exp <= 0) {
        /* fp16 subnormal: make the implicit leading 1 explicit and shift
         * it down into the 10-bit mantissa field (exponent field = 0). */
        mant |= 0x00800000u;
        shift = (uint32_t)(14 - exp);
        half = 0;
    } else {
        /* fp16 normal: drop the low 13 mantissa bits */
        shift = 13;
        half = (uint32_t)exp << 10;
    }

    const uint32_t tie = 1u << (shift - 1);
    const uint32_t rem = mant & ((tie << 1) - 1u);
    half += mant >> shift;

    /* Round to nearest even. A carry out of the mantissa correctly bumps
     * the exponent (subnormal → normal, or max normal → infinity). */
    if (rem > tie || (rem == tie && (half & 1u))) {
        half++;
    }
    return (uint16_t)(sign | half);
}
127+
128+
/* Convert n FP32 values from src into Q8_0 blocks written to dst.
 * Each Q8_0 block is 34 bytes: a 2-byte fp16 scale followed by 32 int8
 * quants. src and dst are separate buffers. Only the n / 32 complete
 * blocks are converted, so n should be a multiple of 32 and dst must have
 * room for (n / 32) * 34 bytes. */
static void quantize_fp32_to_q8_0(const float* src, void* dst, int n) {
    uint8_t* const bytes = (uint8_t*)dst;
    const int block_count = n / 32;
    int blk = 0;

    while (blk < block_count) {
        const float* in = src + blk * 32;
        uint8_t* rec = bytes + blk * 34;

        /* Largest magnitude in this block decides the scale */
        float peak = 0.0f;
        for (int k = 0; k < 32; k++) {
            float mag = in[k];
            if (mag < 0) mag = -mag;
            if (mag > peak) peak = mag;
        }

        /* scale maps [-peak, peak] onto [-127, 127]; an all-zero block
         * gets inverse scale 0 so every quant comes out as 0 */
        const float scale = peak / 127.0f;
        float inv = 0.0f;
        if (scale > 0.0f) inv = 127.0f / peak;

        /* Block header: the scale stored as fp16 */
        const uint16_t scale_h = fp32_to_fp16(scale);
        memcpy(rec, &scale_h, 2);

        /* Block payload: round half away from zero, clamp to [-127, 127] */
        int8_t* q = (int8_t*)(rec + 2);
        for (int k = 0; k < 32; k++) {
            const float scaled = in[k] * inv;
            const float bias = (scaled >= 0) ? 0.5f : -0.5f;
            int32_t r = (int32_t)(scaled + bias);
            if (r > 127) r = 127;
            if (r < -127) r = -127;
            q[k] = (int8_t)r;
        }

        blk++;
    }
}
164+
104165
typedef struct {
105166
int expert_id; /* -1 = empty slot */
106-
uint8_t* gate_q4_qs;
107-
float* gate_q4_scales;
108-
uint8_t* up_q4_qs;
109-
float* up_q4_scales;
110-
uint8_t* down_q4_qs;
111-
float* down_q4_scales;
167+
void* gate_q8; /* Q8_0 block data for gate [inter*dim elems] */
168+
void* up_q8; /* Q8_0 block data for up [inter*dim elems] */
169+
void* down_q8; /* Q8_0 block data for down [dim*inter elems] */
112170
int last_used; /* token counter for LRU eviction */
113171
} expert_cache_entry_t;
114172

@@ -154,21 +212,20 @@ void tq_moe_cache_init(int n_layers, const tq_moe_config_t* config, int hidden_d
154212
size_t max_elems = gate_up_elems > down_elems ? gate_up_elems : down_elems;
155213
g_cache_fp32_temp = (float*)malloc(max_elems * sizeof(float));
156214

215+
/* Q8_0: 34 bytes per 32 elements = 1.0625 bytes/elem
216+
* Per expert: 3 matrices × (inter*dim) elems × 1.0625 B/elem ≈ 1.6 MB (for 1024×512) */
157217
float cache_mb = (float)(n_layers * EXPERT_CACHE_SIZE) *
158-
(3.0f * (float)(gate_up_elems + 31) / 32.0f * 20.0f) /
218+
(3.0f * (float)gate_up_elems * 34.0f / 32.0f) /
159219
(1024.0f * 1024.0f);
160-
fprintf(stderr, "tq_moe_cache_init: LRU cache for %d layers x %d slots "
220+
fprintf(stderr, "tq_moe_cache_init: Q8 LRU cache for %d layers x %d slots "
161221
"(max %.0f MB)\n", n_layers, EXPERT_CACHE_SIZE, (double)cache_mb);
162222
}
163223

164224
static void free_cache_entry(expert_cache_entry_t* e)
165225
{
166-
free(e->gate_q4_qs); e->gate_q4_qs = NULL;
167-
free(e->gate_q4_scales); e->gate_q4_scales = NULL;
168-
free(e->up_q4_qs); e->up_q4_qs = NULL;
169-
free(e->up_q4_scales); e->up_q4_scales = NULL;
170-
free(e->down_q4_qs); e->down_q4_qs = NULL;
171-
free(e->down_q4_scales); e->down_q4_scales = NULL;
226+
free(e->gate_q8); e->gate_q8 = NULL;
227+
free(e->up_q8); e->up_q8 = NULL;
228+
free(e->down_q8); e->down_q8 = NULL;
172229
e->expert_id = -1;
173230
}
174231

@@ -187,14 +244,19 @@ void tq_moe_cache_free(void)
187244
g_cache_n_layers = 0;
188245
}
189246

247+
/* Bytes occupied by the Q8_0 encoding of n elements: 34 bytes for each
 * complete group of 32 elements (n is expected to be a multiple of 32). */
static inline size_t q8_0_bytes(int n) {
    const size_t blocks = (size_t)(n / 32);
    return blocks * 34;
}
251+
190252
/* Find a cached entry for expert_id in layer, or evict LRU and create one.
191-
* Returns the entry with Q4 data populated. */
253+
* Returns the entry with Q8_0 data populated, or NULL on allocation failure. */
192254
static expert_cache_entry_t* cache_get_or_create(
193255
int layer_idx, int expert_id, const tq_expert_weights_t* exp)
194256
{
195257
expert_layer_cache_t* lc = &g_expert_cache[layer_idx];
196258

197-
/* Search for existing entry */
259+
/* Search for existing entry (cache hit) */
198260
for (int s = 0; s < EXPERT_CACHE_SIZE; s++) {
199261
if (lc->entries[s].expert_id == expert_id) {
200262
lc->entries[s].last_used = g_token_counter;
@@ -232,40 +294,34 @@ static expert_cache_entry_t* cache_get_or_create(
232294
int dim = g_cache_hidden_dim;
233295
int inter = g_cache_exp_inter;
234296

235-
/* Convert gate: [inter, dim] */
297+
/* Convert gate: [inter, dim] — dequant IQ2_XXS → FP32 → Q8_0 blocks */
236298
{
237299
int n = inter * dim;
238-
int n_blocks = (n + 31) / 32;
239-
tq_dequant_row_gguf(exp->gate_type, exp->w_gate, g_cache_fp32_temp, n);
240-
ce->gate_q4_qs = (uint8_t*)malloc((size_t)n_blocks * 16);
241-
ce->gate_q4_scales = (float*)malloc((size_t)n_blocks * sizeof(float));
242-
if (ce->gate_q4_qs && ce->gate_q4_scales)
243-
tq_quantize_row_q4(g_cache_fp32_temp, ce->gate_q4_qs,
244-
ce->gate_q4_scales, n);
300+
ce->gate_q8 = malloc(q8_0_bytes(n));
301+
if (ce->gate_q8) {
302+
tq_dequant_row_gguf(exp->gate_type, exp->w_gate, g_cache_fp32_temp, n);
303+
quantize_fp32_to_q8_0(g_cache_fp32_temp, ce->gate_q8, n);
304+
}
245305
}
246306

247307
/* Convert up: [inter, dim] */
248308
{
249309
int n = inter * dim;
250-
int n_blocks = (n + 31) / 32;
251-
tq_dequant_row_gguf(exp->up_type, exp->w_up, g_cache_fp32_temp, n);
252-
ce->up_q4_qs = (uint8_t*)malloc((size_t)n_blocks * 16);
253-
ce->up_q4_scales = (float*)malloc((size_t)n_blocks * sizeof(float));
254-
if (ce->up_q4_qs && ce->up_q4_scales)
255-
tq_quantize_row_q4(g_cache_fp32_temp, ce->up_q4_qs,
256-
ce->up_q4_scales, n);
310+
ce->up_q8 = malloc(q8_0_bytes(n));
311+
if (ce->up_q8) {
312+
tq_dequant_row_gguf(exp->up_type, exp->w_up, g_cache_fp32_temp, n);
313+
quantize_fp32_to_q8_0(g_cache_fp32_temp, ce->up_q8, n);
314+
}
257315
}
258316

259317
/* Convert down: [dim, inter] */
260318
{
261319
int n = dim * inter;
262-
int n_blocks = (n + 31) / 32;
263-
tq_dequant_row_gguf(exp->down_type, exp->w_down, g_cache_fp32_temp, n);
264-
ce->down_q4_qs = (uint8_t*)malloc((size_t)n_blocks * 16);
265-
ce->down_q4_scales = (float*)malloc((size_t)n_blocks * sizeof(float));
266-
if (ce->down_q4_qs && ce->down_q4_scales)
267-
tq_quantize_row_q4(g_cache_fp32_temp, ce->down_q4_qs,
268-
ce->down_q4_scales, n);
320+
ce->down_q8 = malloc(q8_0_bytes(n));
321+
if (ce->down_q8) {
322+
tq_dequant_row_gguf(exp->down_type, exp->w_down, g_cache_fp32_temp, n);
323+
quantize_fp32_to_q8_0(g_cache_fp32_temp, ce->down_q8, n);
324+
}
269325
}
270326

271327
return ce;
@@ -427,26 +483,27 @@ void tq_moe_forward(const tq_moe_layer_t* layer,
427483
if (eid < 0 || eid >= config->num_experts) continue; /* safety check */
428484
const tq_expert_weights_t* exp = &layer->experts[eid];
429485

430-
/* LRU cache disabled — cache miss dequant+Q4 overhead dominates.
431-
* Direct fused GGUF dot product is faster than cache miss penalty. */
486+
/* Q8 LRU cache DISABLED: cache miss conversion cost (IQ2→FP32→Q8)
487+
* exceeds fused IQ2 dot cost. Direct fused_dot_iq2_xxs_neon is faster
488+
* than any cache scheme when expert reuse rate is low. */
432489
if (0 && g_expert_cache && layer_idx >= 0 && layer_idx < g_cache_n_layers
433-
&& exp->w_gate) {
490+
&& exp->w_gate && !exp->q4_converted) {
434491
expert_cache_entry_t* ce = cache_get_or_create(layer_idx, eid, exp);
435-
if (ce->gate_q4_qs && ce->up_q4_qs && ce->down_q4_qs) {
436-
/* Fast Q4 matmul path from LRU cache */
437-
tq_matmul_q4(state->expert_hb, input,
438-
ce->gate_q4_qs, ce->gate_q4_scales,
439-
expert_dim, hidden_dim);
440-
tq_matmul_q4(state->expert_hb2, input,
441-
ce->up_q4_qs, ce->up_q4_scales,
442-
expert_dim, hidden_dim);
492+
if (ce && ce->gate_q8 && ce->up_q8 && ce->down_q8) {
493+
/* Fast Q8_0 matmul path — dispatches to fused_dot_q8_0 (NEON) */
494+
tq_matmul_gguf(state->expert_hb, input,
495+
ce->gate_q8, TQ_GGML_TYPE_Q8_0,
496+
expert_dim, hidden_dim);
497+
tq_matmul_gguf(state->expert_hb2, input,
498+
ce->up_q8, TQ_GGML_TYPE_Q8_0,
499+
expert_dim, hidden_dim);
443500

444501
/* SwiGLU activation: hb = silu(gate) * up */
445502
swiglu_fused(state->expert_hb, state->expert_hb2, expert_dim);
446503

447-
tq_matmul_q4(state->expert_out, state->expert_hb,
448-
ce->down_q4_qs, ce->down_q4_scales,
449-
hidden_dim, expert_dim);
504+
tq_matmul_gguf(state->expert_out, state->expert_hb,
505+
ce->down_q8, TQ_GGML_TYPE_Q8_0,
506+
hidden_dim, expert_dim);
450507

451508
/* Weighted accumulation: output += weight * down_proj */
452509
for (int i = 0; i < hidden_dim; i++)

0 commit comments

Comments
 (0)