From 0bf593831764ad610aa31b44f584665d618d5839 Mon Sep 17 00:00:00 2001
From: root <root@localhost.localdomain>
Date: Tue, 16 Jun 2026 02:48:22 +0200
Subject: [PATCH] Metal: protect tensor alloc/free counters with a mutex

g_tensor_alloc_live_bytes and g_tensor_alloc_peak_bytes were plain
uint64_t globals updated by ds4_gpu_tensor_alloc() and ds4_gpu_tensor_free()
without any synchronisation.  When multiple worker threads concurrently
allocate or free Metal tensors (e.g. during concurrent session cleanup
and new task startup) the unguarded read-modify-write is a data race:
concurrent updates can be lost, leaving the live and peak counters wrong.
This holds on ARM64 too; aligned 64-bit loads and stores are atomic
there, but the load/modify/store sequence as a whole is not.

Add g_tensor_alloc_mu (PTHREAD_MUTEX_INITIALIZER) and hold it around
every counter update and around the counter read in
ds4_gpu_print_memory_report().  Each critical section copies the
counters into locals, and the fprintf calls print those snapshots after
unlocking, so the lock is never held across an I/O call.

Fixes: https://github.com/antirez/ds4/issues/404

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ds4_metal.m | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/ds4_metal.m b/ds4_metal.m
index 51b7b4982..692d3468a 100644
--- a/ds4_metal.m
+++ b/ds4_metal.m
@@ -179,6 +179,7 @@
 static uint64_t g_model_mapped_max_tensor_bytes;
 static uint64_t g_tensor_alloc_live_bytes;
 static uint64_t g_tensor_alloc_peak_bytes;
+static pthread_mutex_t g_tensor_alloc_mu = PTHREAD_MUTEX_INITIALIZER;
 static uint64_t g_model_wrap_count;
 static uint64_t g_model_wrap_bytes;
 static uint64_t g_model_wrap_max_bytes;
@@ -2523,10 +2524,14 @@ void ds4_gpu_print_memory_report(const char *label) {
     fprintf(stderr, "ds4: Metal memory report%s%s\n",
             label && label[0] ? " " : "",
             label && label[0] ? label : "");
+    pthread_mutex_lock(&g_tensor_alloc_mu);
+    uint64_t tensor_live_snap = g_tensor_alloc_live_bytes;
+    uint64_t tensor_peak_snap = g_tensor_alloc_peak_bytes;
+    pthread_mutex_unlock(&g_tensor_alloc_mu);
     fprintf(stderr,
             "ds4:   runtime tensors live %.2f MiB peak %.2f MiB\n",
-            ds4_gpu_mib(g_tensor_alloc_live_bytes),
-            ds4_gpu_mib(g_tensor_alloc_peak_bytes));
+            ds4_gpu_mib(tensor_live_snap),
+            ds4_gpu_mib(tensor_peak_snap));
     ds4_gpu_print_task_memory_report();
     fprintf(stderr,
             "ds4:   mmap model wrapper spans %llu buffers %.2f GiB total, %.2f GiB max (not copied)\n",
@@ -6044,16 +6049,20 @@ int ds4_gpu_init(void) {
         tensor.offset = 0;
         tensor.bytes = bytes;
         tensor.owner = 1;
+        pthread_mutex_lock(&g_tensor_alloc_mu);
         g_tensor_alloc_live_bytes += bytes;
         if (g_tensor_alloc_live_bytes > g_tensor_alloc_peak_bytes) {
             g_tensor_alloc_peak_bytes = g_tensor_alloc_live_bytes;
         }
+        uint64_t live_snap = g_tensor_alloc_live_bytes;
+        uint64_t peak_snap = g_tensor_alloc_peak_bytes;
+        pthread_mutex_unlock(&g_tensor_alloc_mu);
         if (ds4_gpu_trace_allocs()) {
             fprintf(stderr,
                     "ds4: Metal tensor alloc %.3f MiB live %.3f MiB peak %.3f MiB\n",
                     (double)bytes / (1024.0 * 1024.0),
-                    (double)g_tensor_alloc_live_bytes / (1024.0 * 1024.0),
-                    (double)g_tensor_alloc_peak_bytes / (1024.0 * 1024.0));
+                    (double)live_snap / (1024.0 * 1024.0),
+                    (double)peak_snap / (1024.0 * 1024.0));
         }
         return (__bridge_retained ds4_gpu_tensor *)tensor;
     }
@@ -6092,17 +6101,21 @@ void ds4_gpu_tensor_free(ds4_gpu_tensor *tensor) {
     @autoreleasepool {
         DS4MetalTensor *obj = (__bridge_transfer DS4MetalTensor *)tensor;
         if (obj.owner) {
+            pthread_mutex_lock(&g_tensor_alloc_mu);
             if (obj.bytes <= g_tensor_alloc_live_bytes) {
                 g_tensor_alloc_live_bytes -= obj.bytes;
             } else {
                 g_tensor_alloc_live_bytes = 0;
             }
+            uint64_t live_snap = g_tensor_alloc_live_bytes;
+            uint64_t peak_snap = g_tensor_alloc_peak_bytes;
+            pthread_mutex_unlock(&g_tensor_alloc_mu);
             if (ds4_gpu_trace_allocs()) {
                 fprintf(stderr,
                         "ds4: Metal tensor free %.3f MiB live %.3f MiB peak %.3f MiB\n",
                         (double)obj.bytes / (1024.0 * 1024.0),
-                        (double)g_tensor_alloc_live_bytes / (1024.0 * 1024.0),
-                        (double)g_tensor_alloc_peak_bytes / (1024.0 * 1024.0));
+                        (double)live_snap / (1024.0 * 1024.0),
+                        (double)peak_snap / (1024.0 * 1024.0));
             }
         }
         obj.buffer = nil;