@@ -903,7 +903,20 @@ int tq_metal_moe_forward(
903903 options: MTLResourceStorageModeShared ];
904904 if (!params_buf) return -1 ;
905905
906- /* --- Create command buffer and encoder --- */
906+ /* --- Create output buffer for Phase 3 (allocated once with other buffers) --- */
907+ size_t output_bytes = (size_t )hidden_dim * sizeof (float );
908+ id <MTLBuffer > output_buf = [tq_mtl_device newBufferWithLength: output_bytes
909+ options: MTLResourceStorageModeShared ];
910+ if (!output_buf) {
911+ /* Fallback to hybrid if buffer creation fails */
912+ memcpy (hb_output, [gate_buf contents ], inter_bytes);
913+ return 1 ;
914+ }
915+
916+ /* --- Single command buffer for all 3 phases (MLX pattern) ---
917+ * Compute encoders within one command buffer execute in encoding order,
918+ * and Metal's hazard tracking (the default for buffers) orders dependent
919+ * accesses across encoders; memoryBarrierWithScope only orders commands within one encoder. */
907920 id <MTLCommandBuffer > cmdBuf = [tq_mtl_queue commandBuffer ];
908921 if (!cmdBuf) return -1 ;
909922
@@ -933,56 +946,13 @@ int tq_metal_moe_forward(
933946 MTLSize gridSize = MTLSizeMake (n_tgs, 1 , 1 );
934947 MTLSize tgSize = MTLSizeMake (TQ_MATMUL_TG_SIZE , 1 , 1 );
935948 [enc dispatchThreadgroups: gridSize threadsPerThreadgroup: tgSize];
936- [enc endEncoding ];
937- }
938-
939- /* --- Phase 1: commit and wait to isolate hang --- */
940- [cmdBuf commit ];
941- [cmdBuf waitUntilCompleted ];
942-
943- if (cmdBuf.status == MTLCommandBufferStatusError ) {
944- NSLog (@" TurboQuant MoE: Phase 1 (gate+up) FAILED: %@ " , cmdBuf.error );
945- return -1 ;
946- }
947- NSLog (@" TurboQuant MoE: Phase 1 (gate+up) completed OK" );
948949
949- #ifdef TQ_MOE_DEBUG_VALIDATE
950- /* === Debug: compare GPU gate output for expert 0 vs CPU tq_matmul_gguf === */
951- {
952- /* tq_matmul_gguf declared in tq_gguf.h (already included) */
953- float * gpu_gate = (float *)[gate_buf contents ];
954- float * cpu_gate = (float *)malloc ((size_t )expert_dim * sizeof (float ));
955- if (cpu_gate) {
956- /* CPU matmul for expert 0's gate weights */
957- const uint8_t * gate_w = (const uint8_t *)weight_base + gate_offsets[0 ];
958- tq_ggml_dtype gt0 = gate_types_in ? (tq_ggml_dtype)gate_types_in[0 ]
959- : (tq_ggml_dtype)weight_type;
960- tq_matmul_gguf (cpu_gate, input, gate_w, gt0, expert_dim, hidden_dim);
961-
962- /* Compare first 8 and last 8 values */
963- NSLog (@" TurboQuant MoE DEBUG: gate expert 0 comparison (first 8):" );
964- float max_err = 0 .0f ;
965- for (int i = 0 ; i < expert_dim; i++) {
966- float err = fabsf (gpu_gate[i] - cpu_gate[i]);
967- if (err > max_err) max_err = err;
968- if (i < 8 || i >= expert_dim - 4 ) {
969- NSLog (@" [%d ] GPU=%.6f CPU=%.6f err=%.6f " , i, gpu_gate[i], cpu_gate[i], err);
970- }
971- }
972- NSLog (@" TurboQuant MoE DEBUG: gate max_err=%.6f across %d elements" , max_err, expert_dim);
973- if (max_err > 0 .01f ) {
974- NSLog (@" TurboQuant MoE DEBUG: *** MISMATCH DETECTED *** — weight offset or decoding bug" );
975- }
976- free (cpu_gate);
977- }
950+ /* Memory barrier: ensure gate_buf/up_buf writes visible to Phase 2 */
951+ [enc memoryBarrierWithScope: MTLBarrierScopeBuffers ];
952+ [enc endEncoding ];
978953 }
979- #endif /* TQ_MOE_DEBUG_VALIDATE */
980954
981- /* --- New command buffer for Phase 2 --- */
982- cmdBuf = [tq_mtl_queue commandBuffer ];
983- if (!cmdBuf) return -1 ;
984-
985- /* ======== Phase 2: SwiGLU ======== */
955+ /* ======== Phase 2: SwiGLU (reads gate_buf/up_buf from Phase 1) ======== */
986956 {
987957 id <MTLComputeCommandEncoder > enc = [cmdBuf computeCommandEncoder ];
988958 if (!enc) return -1 ;
@@ -998,40 +968,15 @@ int tq_metal_moe_forward(
998968 MTLSize gridSize = MTLSizeMake (n_tgs, 1 , 1 );
999969 MTLSize tgSize = MTLSizeMake (tg, 1 , 1 );
1000970 [enc dispatchThreadgroups: gridSize threadsPerThreadgroup: tgSize];
1001- [enc endEncoding ];
1002- }
1003-
1004- /* --- Phase 2: commit and wait to isolate hang --- */
1005- [cmdBuf commit ];
1006- [cmdBuf waitUntilCompleted ];
1007971
1008- if (cmdBuf. status == MTLCommandBufferStatusError ) {
1009- NSLog ( @" TurboQuant MoE: Phase 2 (SwiGLU) FAILED: %@ " , cmdBuf. error ) ;
1010- return - 1 ;
972+ /* Memory barrier: ensure gate_buf writes visible to Phase 3 */
973+ [enc memoryBarrierWithScope: MTLBarrierScopeBuffers ] ;
974+ [enc endEncoding ] ;
1011975 }
1012- NSLog (@" TurboQuant MoE: Phase 2 (SwiGLU) completed OK" );
1013976
1014977 /* ======== Phase 3: down projection + weighted accumulate (GPU) ========
1015- * Previously skipped due to IQ2_S shader hanging with constant array.
1016- * Now fixed: IQ2_S codebook passed as device buffer (buffer 4). */
978+ * IQ2_S codebook passed as device buffer (buffer 4). */
1017979 {
1018- /* Create output buffer for hidden_dim results */
1019- size_t output_bytes = (size_t )hidden_dim * sizeof (float );
1020- id <MTLBuffer > output_buf = [tq_mtl_device newBufferWithLength: output_bytes
1021- options: MTLResourceStorageModeShared ];
1022- if (!output_buf) {
1023- /* Fallback to hybrid if buffer creation fails */
1024- memcpy (hb_output, [gate_buf contents ], inter_bytes);
1025- return 1 ;
1026- }
1027-
1028- /* New command buffer for Phase 3 */
1029- cmdBuf = [tq_mtl_queue commandBuffer ];
1030- if (!cmdBuf) {
1031- memcpy (hb_output, [gate_buf contents ], inter_bytes);
1032- return 1 ;
1033- }
1034-
1035980 id <MTLComputeCommandEncoder > enc = [cmdBuf computeCommandEncoder ];
1036981 if (!enc) {
1037982 memcpy (hb_output, [gate_buf contents ], inter_bytes);
@@ -1057,26 +1002,26 @@ int tq_metal_moe_forward(
10571002 MTLSize tgSize3 = MTLSizeMake (TQ_MATMUL_TG_SIZE , 1 , 1 );
10581003 [enc dispatchThreadgroups: gridSize3 threadsPerThreadgroup: tgSize3];
10591004 [enc endEncoding ];
1005+ }
10601006
1061- [cmdBuf commit ];
1062- [cmdBuf waitUntilCompleted ];
1007+ /* ONE commit + wait for all 3 phases */
1008+ [cmdBuf commit ];
1009+ [cmdBuf waitUntilCompleted ];
10631010
1064- if (cmdBuf.status == MTLCommandBufferStatusError ) {
1065- NSLog (@" TurboQuant MoE: Phase 3 (down+accum) FAILED: %@ " , cmdBuf.error );
1066- /* Fallback to hybrid on failure */
1067- memcpy (hb_output, [gate_buf contents ], inter_bytes);
1068- return 1 ;
1069- }
1070- NSLog (@" TurboQuant MoE: Phase 3 (down+accum) completed OK" );
1011+ if (cmdBuf.status == MTLCommandBufferStatusError ) {
1012+ NSLog (@" TurboQuant MoE: GPU dispatch FAILED: %@ " , cmdBuf.error );
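1013+ /* Fallback to hybrid on failure. NOTE(review): with a single command buffer the error may originate in Phase 1 or 2, so gate_buf may hold incomplete data here — confirm the caller tolerates this or return -1 instead. */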
1014+ memcpy (hb_output, [gate_buf contents ], inter_bytes);
1015+ return 1 ;
1016+ }
10711017
1072- /* Copy result to output */
1073- memcpy (output, [output_buf contents ], output_bytes);
1018+ /* Copy result to output */
1019+ memcpy (output, [output_buf contents ], output_bytes);
10741020
1075- /* Also copy hb for potential caller use */
1076- memcpy (hb_output, [gate_buf contents ], inter_bytes);
1021+ /* Also copy hb for potential caller use */
1022+ memcpy (hb_output, [gate_buf contents ], inter_bytes);
10771023
1078- return 0 ; /* Full GPU success */
1079- }
1024+ return 0 ; /* Full GPU success */
10801025 }
10811026}
10821027
0 commit comments