33 *
44 * Implements top-K expert selection with softmax renormalization,
55 * SwiGLU FFN dispatch per expert, shared expert support,
6- * runtime LRU Q4 cache for routed experts, and memory advise hints.
6+ * runtime LRU Q8_0 cache for routed experts, and memory advise hints.
77 */
88
99#include "turboquant/tq_gguf.h"
@@ -88,27 +88,85 @@ static void swiglu_fused(float* restrict hb, const float* restrict hb2, int n) {
8888#endif
8989
9090/* ============================================================
91- * Runtime Expert Q4 LRU Cache
91+ * Runtime Expert Q8_0 LRU Cache
9292 *
9393 * MoE models with 256 experts x 40 layers would need ~19 GB
94- * if all experts were pre-converted to Q4 . Instead, we cache
95- * only the EXPERT_CACHE_SIZE most-recently-used experts per
96- * layer. With 32 slots per layer, this is ~1.9 GB total .
94+ * if all experts were pre-converted. Instead, we cache only the
95+ * EXPERT_CACHE_SIZE most-recently-used experts per layer in Q8_0
96+ * block format (34 bytes per 32 elements = ~1.06 bytes/elem) .
9797 *
98- * Cache hits use fast Q4 matmul; misses dequant from GGUF
99- * mmap on the fly, then cache the result for next time.
98+ * Q8_0 fused dot is ~3-5x faster than IQ2_XXS fused dot because
99+ * it avoids E8 lattice codebook lookups — just int8*float FMA.
100+ * On cache miss, we dequant IQ2_XXS → FP32 → Q8_0 blocks once.
101+ * On cache hit, tq_matmul_gguf dispatches to fused_dot_q8_0.
102+ *
103+ * Memory: 34 bytes/32 elems ≈ 1.0625 B/elem. For an expert with
104+ * 3M params (gate+up+down), that's ~3.2 MB per cached expert.
105+ * 32 slots/layer × 3.2 MB ≈ 102 MB/layer. For 30 layers: ~3 GB.
100106 * ============================================================ */
101107
102108#define EXPERT_CACHE_SIZE 32 /* per layer */
103109
/* FP32 → FP16 conversion for Q8_0 block scale fields.
 *
 * Converts an IEEE-754 binary32 value to its binary16 bit pattern using
 * round-to-nearest-even:
 *   - values too large for half precision become ±infinity,
 *   - values below the half-precision normal range become half subnormals
 *     (or ±0 when smaller than half of the smallest subnormal),
 *   - NaN stays NaN (quiet), infinity stays infinity,
 *   - the sign bit is always preserved.
 *
 * Returns the 16 raw bits; the caller copies them into the block header. */
static inline uint16_t fp32_to_fp16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);   /* type-pun without aliasing UB */

    const uint32_t sign   = (bits >> 16) & 0x8000u;
    const uint32_t exp32  = (bits >> 23) & 0xFFu;
    const uint32_t mant32 = bits & 0x007FFFFFu;

    if (exp32 == 0xFFu) {
        if (mant32 != 0u) {
            /* NaN: force the quiet bit so the payload can never collapse
             * to zero (which would read back as infinity). */
            return (uint16_t)(sign | 0x7E00u | (mant32 >> 13));
        }
        return (uint16_t)(sign | 0x7C00u);   /* ±infinity */
    }

    /* Re-bias the exponent from binary32 (127) to binary16 (15). */
    const int32_t exp16 = (int32_t)exp32 - 127 + 15;

    if (exp16 >= 31) {
        /* Overflow to infinity */
        return (uint16_t)(sign | 0x7C00u);
    }

    if (exp16 <= 0) {
        /* Result is a half subnormal or zero. Anything with exp16 < -10 is
         * below half of the smallest subnormal (2^-25) and rounds to zero;
         * this also covers binary32 zeros and subnormals. */
        if (exp16 < -10) {
            return (uint16_t)sign;
        }
        const uint32_t full    = mant32 | 0x00800000u;      /* implicit 1 */
        const uint32_t shift   = (uint32_t)(14 - exp16);    /* 14..24 */
        uint32_t       sub     = full >> shift;
        const uint32_t rem     = full & ((1u << shift) - 1u);
        const uint32_t halfway = 1u << (shift - 1u);
        if (rem > halfway || (rem == halfway && (sub & 1u))) {
            sub++;   /* may carry into the smallest normal, which is correct */
        }
        return (uint16_t)(sign | sub);
    }

    /* Normal range: keep the top 10 mantissa bits, round on the 13 dropped. */
    uint32_t       half = ((uint32_t)exp16 << 10) | (mant32 >> 13);
    const uint32_t rem  = mant32 & 0x1FFFu;
    if (rem > 0x1000u || (rem == 0x1000u && (half & 1u))) {
        half++;   /* a carry into the exponent yields the next binade or inf */
    }
    return (uint16_t)(sign | half);
}
127+
/* Quantize an FP32 array to Q8_0 block format (out of place: src is only
 * read, dst is only written).
 *
 * Q8_0 block: 2-byte fp16 scale + 32 int8 values = 34 bytes per 32 elements.
 * For each block the scale is d = amax / 127, where amax is the largest
 * magnitude in the block, and element j is stored as round(src[j] / d)
 * clamped to [-127, 127]. An all-zero block stores d = 0 and all-zero values.
 *
 * src: n input floats.
 * dst: must have room for (n/32) * 34 bytes.
 * n:   element count. Only whole 32-element blocks are converted, so n should
 *      be a multiple of 32; a trailing partial block is neither read nor
 *      written. n < 32 (including n <= 0) is a no-op.
 *
 * NaN lanes are stored as 0 instead of going through an undefined
 * float-to-int conversion. Non-finite inputs still produce a non-finite
 * scale, so such blocks are not meaningful when dequantized. */
static void quantize_fp32_to_q8_0(const float* src, void* dst, int n) {
    enum { BLOCK_ELEMS = 32, BLOCK_BYTES = 34 };

    if (n < BLOCK_ELEMS) {
        return;
    }

    /* size_t arithmetic: the byte offset b * 34 can exceed INT_MAX for
     * very large n even though n itself fits in an int. */
    const size_t nb = (size_t)n / BLOCK_ELEMS;
    uint8_t* out = (uint8_t*)dst;

    for (size_t b = 0; b < nb; b++) {
        const float* block = src + b * BLOCK_ELEMS;
        uint8_t* blk_out = out + b * BLOCK_BYTES;

        /* Find max absolute value (NaN never compares greater, so it is
         * ignored here). */
        float amax = 0.0f;
        for (int j = 0; j < BLOCK_ELEMS; j++) {
            const float a = block[j] < 0 ? -block[j] : block[j];
            if (a > amax) {
                amax = a;
            }
        }

        /* Scale: map [-amax, amax] to [-127, 127] */
        const float d = amax / 127.0f;
        const float id = (d > 0.0f) ? 127.0f / amax : 0.0f;

        /* Write fp16 scale; memcpy because the block is only byte-aligned. */
        const uint16_t d_fp16 = fp32_to_fp16(d);
        memcpy(blk_out, &d_fp16, sizeof d_fp16);

        /* Write quantized int8 values */
        int8_t* qs = (int8_t*)(blk_out + sizeof d_fp16);
        for (int j = 0; j < BLOCK_ELEMS; j++) {
            const float v = block[j] * id;
            int32_t q;
            if (v != v) {
                q = 0;                      /* NaN: avoid UB in the cast */
            } else if (v >= 127.0f) {
                q = 127;
            } else if (v <= -127.0f) {
                q = -127;
            } else {
                /* round half away from zero */
                q = (int32_t)(v + (v >= 0.0f ? 0.5f : -0.5f));
            }
            qs[j] = (int8_t)q;
        }
    }
}
164+
/* One slot of the per-layer expert LRU cache.
 *
 * The three weight pointers are allocated with malloc when the slot is
 * filled and released by free_cache_entry(); each is NULL when no data is
 * held. The buffers contain Q8_0 blocks (34 bytes per 32 elements). */
typedef struct {
    int   expert_id;   /* -1 = empty slot */
    void* gate_q8;     /* Q8_0 block data for gate [inter*dim elems] */
    void* up_q8;       /* Q8_0 block data for up   [inter*dim elems] */
    void* down_q8;     /* Q8_0 block data for down [dim*inter elems] */
    int   last_used;   /* token counter for LRU eviction */
} expert_cache_entry_t;
114172
@@ -154,21 +212,20 @@ void tq_moe_cache_init(int n_layers, const tq_moe_config_t* config, int hidden_d
154212 size_t max_elems = gate_up_elems > down_elems ? gate_up_elems : down_elems ;
155213 g_cache_fp32_temp = (float * )malloc (max_elems * sizeof (float ));
156214
215+ /* Q8_0: 34 bytes per 32 elements = 1.0625 bytes/elem
216+ * Per expert: 3 matrices × (inter*dim) elems × 1.0625 ≈ 3.2 MB (for inter*dim ≈ 1M elems, e.g. 1024×1024; a 1024×512 expert is ≈ 1.6 MB) */
157217 float cache_mb = (float )(n_layers * EXPERT_CACHE_SIZE ) *
158- (3.0f * (float )( gate_up_elems + 31 ) / 32 .0f * 20 .0f ) /
218+ (3.0f * (float )gate_up_elems * 34 .0f / 32 .0f ) /
159219 (1024.0f * 1024.0f );
160- fprintf (stderr , "tq_moe_cache_init: LRU cache for %d layers x %d slots "
220+ fprintf (stderr , "tq_moe_cache_init: Q8 LRU cache for %d layers x %d slots "
161221 "(max %.0f MB)\n" , n_layers , EXPERT_CACHE_SIZE , (double )cache_mb );
162222}
163223
164224static void free_cache_entry (expert_cache_entry_t * e )
165225{
166- free (e -> gate_q4_qs ); e -> gate_q4_qs = NULL ;
167- free (e -> gate_q4_scales ); e -> gate_q4_scales = NULL ;
168- free (e -> up_q4_qs ); e -> up_q4_qs = NULL ;
169- free (e -> up_q4_scales ); e -> up_q4_scales = NULL ;
170- free (e -> down_q4_qs ); e -> down_q4_qs = NULL ;
171- free (e -> down_q4_scales ); e -> down_q4_scales = NULL ;
226+ free (e -> gate_q8 ); e -> gate_q8 = NULL ;
227+ free (e -> up_q8 ); e -> up_q8 = NULL ;
228+ free (e -> down_q8 ); e -> down_q8 = NULL ;
172229 e -> expert_id = -1 ;
173230}
174231
@@ -187,14 +244,19 @@ void tq_moe_cache_free(void)
187244 g_cache_n_layers = 0 ;
188245}
189246
/* Q8_0 block byte size for n elements.
 *
 * Each whole group of 32 elements occupies 34 bytes (2-byte fp16 scale +
 * 32 int8 values). n should be a multiple of 32: a trailing partial group is
 * not counted. Returns 0 for n <= 0, so a negative count can never turn into
 * an enormous size_t. */
static inline size_t q8_0_bytes(int n) {
    if (n <= 0) {
        return 0;
    }
    return ((size_t)n / 32) * 34;
}
251+
190252/* Find a cached entry for expert_id in layer, or evict LRU and create one.
191- * Returns the entry with Q4 data populated. */
253+ * Returns the entry with Q8_0 data populated, or NULL on allocation failure . */
192254static expert_cache_entry_t * cache_get_or_create (
193255 int layer_idx , int expert_id , const tq_expert_weights_t * exp )
194256{
195257 expert_layer_cache_t * lc = & g_expert_cache [layer_idx ];
196258
197- /* Search for existing entry */
259+ /* Search for existing entry (cache hit) */
198260 for (int s = 0 ; s < EXPERT_CACHE_SIZE ; s ++ ) {
199261 if (lc -> entries [s ].expert_id == expert_id ) {
200262 lc -> entries [s ].last_used = g_token_counter ;
@@ -232,40 +294,34 @@ static expert_cache_entry_t* cache_get_or_create(
232294 int dim = g_cache_hidden_dim ;
233295 int inter = g_cache_exp_inter ;
234296
235- /* Convert gate: [inter, dim] */
297+ /* Convert gate: [inter, dim] — dequant IQ2_XXS → FP32 → Q8_0 blocks */
236298 {
237299 int n = inter * dim ;
238- int n_blocks = (n + 31 ) / 32 ;
239- tq_dequant_row_gguf (exp -> gate_type , exp -> w_gate , g_cache_fp32_temp , n );
240- ce -> gate_q4_qs = (uint8_t * )malloc ((size_t )n_blocks * 16 );
241- ce -> gate_q4_scales = (float * )malloc ((size_t )n_blocks * sizeof (float ));
242- if (ce -> gate_q4_qs && ce -> gate_q4_scales )
243- tq_quantize_row_q4 (g_cache_fp32_temp , ce -> gate_q4_qs ,
244- ce -> gate_q4_scales , n );
300+ ce -> gate_q8 = malloc (q8_0_bytes (n ));
301+ if (ce -> gate_q8 ) {
302+ tq_dequant_row_gguf (exp -> gate_type , exp -> w_gate , g_cache_fp32_temp , n );
303+ quantize_fp32_to_q8_0 (g_cache_fp32_temp , ce -> gate_q8 , n );
304+ }
245305 }
246306
247307 /* Convert up: [inter, dim] */
248308 {
249309 int n = inter * dim ;
250- int n_blocks = (n + 31 ) / 32 ;
251- tq_dequant_row_gguf (exp -> up_type , exp -> w_up , g_cache_fp32_temp , n );
252- ce -> up_q4_qs = (uint8_t * )malloc ((size_t )n_blocks * 16 );
253- ce -> up_q4_scales = (float * )malloc ((size_t )n_blocks * sizeof (float ));
254- if (ce -> up_q4_qs && ce -> up_q4_scales )
255- tq_quantize_row_q4 (g_cache_fp32_temp , ce -> up_q4_qs ,
256- ce -> up_q4_scales , n );
310+ ce -> up_q8 = malloc (q8_0_bytes (n ));
311+ if (ce -> up_q8 ) {
312+ tq_dequant_row_gguf (exp -> up_type , exp -> w_up , g_cache_fp32_temp , n );
313+ quantize_fp32_to_q8_0 (g_cache_fp32_temp , ce -> up_q8 , n );
314+ }
257315 }
258316
259317 /* Convert down: [dim, inter] */
260318 {
261319 int n = dim * inter ;
262- int n_blocks = (n + 31 ) / 32 ;
263- tq_dequant_row_gguf (exp -> down_type , exp -> w_down , g_cache_fp32_temp , n );
264- ce -> down_q4_qs = (uint8_t * )malloc ((size_t )n_blocks * 16 );
265- ce -> down_q4_scales = (float * )malloc ((size_t )n_blocks * sizeof (float ));
266- if (ce -> down_q4_qs && ce -> down_q4_scales )
267- tq_quantize_row_q4 (g_cache_fp32_temp , ce -> down_q4_qs ,
268- ce -> down_q4_scales , n );
320+ ce -> down_q8 = malloc (q8_0_bytes (n ));
321+ if (ce -> down_q8 ) {
322+ tq_dequant_row_gguf (exp -> down_type , exp -> w_down , g_cache_fp32_temp , n );
323+ quantize_fp32_to_q8_0 (g_cache_fp32_temp , ce -> down_q8 , n );
324+ }
269325 }
270326
271327 return ce ;
@@ -427,26 +483,27 @@ void tq_moe_forward(const tq_moe_layer_t* layer,
427483 if (eid < 0 || eid >= config -> num_experts ) continue ; /* safety check */
428484 const tq_expert_weights_t * exp = & layer -> experts [eid ];
429485
430- /* LRU cache disabled — cache miss dequant+Q4 overhead dominates.
431- * Direct fused GGUF dot product is faster than cache miss penalty. */
486+ /* Q8 LRU cache DISABLED: cache miss conversion cost (IQ2→FP32→Q8)
487+ * exceeds fused IQ2 dot cost. Direct fused_dot_iq2_xxs_neon is faster
488+ * than any cache scheme when expert reuse rate is low. */
432489 if (0 && g_expert_cache && layer_idx >= 0 && layer_idx < g_cache_n_layers
433- && exp -> w_gate ) {
490+ && exp -> w_gate && ! exp -> q4_converted ) {
434491 expert_cache_entry_t * ce = cache_get_or_create (layer_idx , eid , exp );
435- if (ce -> gate_q4_qs && ce -> up_q4_qs && ce -> down_q4_qs ) {
436- /* Fast Q4 matmul path from LRU cache */
437- tq_matmul_q4 (state -> expert_hb , input ,
438- ce -> gate_q4_qs , ce -> gate_q4_scales ,
439- expert_dim , hidden_dim );
440- tq_matmul_q4 (state -> expert_hb2 , input ,
441- ce -> up_q4_qs , ce -> up_q4_scales ,
442- expert_dim , hidden_dim );
492+ if (ce && ce -> gate_q8 && ce -> up_q8 && ce -> down_q8 ) {
493+ /* Fast Q8_0 matmul path — dispatches to fused_dot_q8_0 (NEON) */
494+ tq_matmul_gguf (state -> expert_hb , input ,
495+ ce -> gate_q8 , TQ_GGML_TYPE_Q8_0 ,
496+ expert_dim , hidden_dim );
497+ tq_matmul_gguf (state -> expert_hb2 , input ,
498+ ce -> up_q8 , TQ_GGML_TYPE_Q8_0 ,
499+ expert_dim , hidden_dim );
443500
444501 /* SwiGLU activation: hb = silu(gate) * up */
445502 swiglu_fused (state -> expert_hb , state -> expert_hb2 , expert_dim );
446503
447- tq_matmul_q4 (state -> expert_out , state -> expert_hb ,
448- ce -> down_q4_qs , ce -> down_q4_scales ,
449- hidden_dim , expert_dim );
504+ tq_matmul_gguf (state -> expert_out , state -> expert_hb ,
505+ ce -> down_q8 , TQ_GGML_TYPE_Q8_0 ,
506+ hidden_dim , expert_dim );
450507
451508 /* Weighted accumulation: output += weight * down_proj */
452509 for (int i = 0 ; i < hidden_dim ; i ++ )
0 commit comments