From 0bf593831764ad610aa31b44f584665d618d5839 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 16 Jun 2026 02:48:22 +0200 Subject: [PATCH] Metal: protect tensor alloc/free counters with a mutex g_tensor_alloc_live_bytes and g_tensor_alloc_peak_bytes were plain uint64_t globals updated by ds4_gpu_tensor_alloc() and ds4_gpu_tensor_free() without any synchronisation. When multiple worker threads concurrently allocate or free Metal tensors (e.g. during concurrent session cleanup and new task startup), the unguarded read-modify-write can lose updates and corrupt the counters; the unsynchronised access to the plain uint64_t globals is also a data race (undefined behaviour under the C11 memory model) on every target, including ARM64. Add g_tensor_alloc_mu (PTHREAD_MUTEX_INITIALIZER) and take it around every update and the diagnostic snapshot read. The fprintf stays outside the lock; it uses a local snapshot so the lock is never held across an I/O call. Fixes: https://github.com/antirez/ds4/issues/404 Co-Authored-By: Claude Sonnet 4.6 --- ds4_metal.m | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/ds4_metal.m b/ds4_metal.m index 51b7b4982..692d3468a 100644 --- a/ds4_metal.m +++ b/ds4_metal.m @@ -179,6 +179,7 @@ static uint64_t g_model_mapped_max_tensor_bytes; static uint64_t g_tensor_alloc_live_bytes; static uint64_t g_tensor_alloc_peak_bytes; +static pthread_mutex_t g_tensor_alloc_mu = PTHREAD_MUTEX_INITIALIZER; static uint64_t g_model_wrap_count; static uint64_t g_model_wrap_bytes; static uint64_t g_model_wrap_max_bytes; @@ -2523,10 +2524,14 @@ void ds4_gpu_print_memory_report(const char *label) { fprintf(stderr, "ds4: Metal memory report%s%s\n", label && label[0] ? " " : "", label && label[0] ? 
label : ""); + pthread_mutex_lock(&g_tensor_alloc_mu); + uint64_t tensor_live_snap = g_tensor_alloc_live_bytes; + uint64_t tensor_peak_snap = g_tensor_alloc_peak_bytes; + pthread_mutex_unlock(&g_tensor_alloc_mu); fprintf(stderr, "ds4: runtime tensors live %.2f MiB peak %.2f MiB\n", - ds4_gpu_mib(g_tensor_alloc_live_bytes), - ds4_gpu_mib(g_tensor_alloc_peak_bytes)); + ds4_gpu_mib(tensor_live_snap), + ds4_gpu_mib(tensor_peak_snap)); ds4_gpu_print_task_memory_report(); fprintf(stderr, "ds4: mmap model wrapper spans %llu buffers %.2f GiB total, %.2f GiB max (not copied)\n", @@ -6044,16 +6049,20 @@ int ds4_gpu_init(void) { tensor.offset = 0; tensor.bytes = bytes; tensor.owner = 1; + pthread_mutex_lock(&g_tensor_alloc_mu); g_tensor_alloc_live_bytes += bytes; if (g_tensor_alloc_live_bytes > g_tensor_alloc_peak_bytes) { g_tensor_alloc_peak_bytes = g_tensor_alloc_live_bytes; } + uint64_t live_snap = g_tensor_alloc_live_bytes; + uint64_t peak_snap = g_tensor_alloc_peak_bytes; + pthread_mutex_unlock(&g_tensor_alloc_mu); if (ds4_gpu_trace_allocs()) { fprintf(stderr, "ds4: Metal tensor alloc %.3f MiB live %.3f MiB peak %.3f MiB\n", (double)bytes / (1024.0 * 1024.0), - (double)g_tensor_alloc_live_bytes / (1024.0 * 1024.0), - (double)g_tensor_alloc_peak_bytes / (1024.0 * 1024.0)); + (double)live_snap / (1024.0 * 1024.0), + (double)peak_snap / (1024.0 * 1024.0)); } return (__bridge_retained ds4_gpu_tensor *)tensor; } @@ -6092,17 +6101,21 @@ void ds4_gpu_tensor_free(ds4_gpu_tensor *tensor) { @autoreleasepool { DS4MetalTensor *obj = (__bridge_transfer DS4MetalTensor *)tensor; if (obj.owner) { + pthread_mutex_lock(&g_tensor_alloc_mu); if (obj.bytes <= g_tensor_alloc_live_bytes) { g_tensor_alloc_live_bytes -= obj.bytes; } else { g_tensor_alloc_live_bytes = 0; } + uint64_t live_snap = g_tensor_alloc_live_bytes; + uint64_t peak_snap = g_tensor_alloc_peak_bytes; + pthread_mutex_unlock(&g_tensor_alloc_mu); if (ds4_gpu_trace_allocs()) { fprintf(stderr, "ds4: Metal tensor free %.3f MiB live 
%.3f MiB peak %.3f MiB\n", (double)obj.bytes / (1024.0 * 1024.0), - (double)g_tensor_alloc_live_bytes / (1024.0 * 1024.0), - (double)g_tensor_alloc_peak_bytes / (1024.0 * 1024.0)); + (double)live_snap / (1024.0 * 1024.0), + (double)peak_snap / (1024.0 * 1024.0)); } } obj.buffer = nil;