ScalarLM/docker-compose.observability.yaml at main · tensorwavecloud/ScalarLM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
services:
  # Prometheus server for the observability stack; UI/API published on host port 9090.
  prometheus:
    container_name: scalarlm-prometheus
    image: prom/prometheus:latest
    restart: unless-stopped
    # Makes the Docker host resolvable from inside the container as
    # host.docker.internal.
    extra_hosts:
      - 'host.docker.internal:host-gateway'
    ports:
      - '9090:9090'
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=15d"
      - "--web.enable-lifecycle"
    volumes:
      # Scrape configuration and alert rules come from the repository checkout.
      - ./deployment/observability/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./deployment/observability/alerts.yml:/etc/prometheus/alerts.yml
      # Named volume backing the TSDB path passed via --storage.tsdb.path.
      - prometheus-data:/prometheus

  # Grafana UI for the stack; container port 3000 is published on host port 3001.
  grafana:
    image: grafana/grafana:latest
    container_name: scalarlm-grafana
    ports:
      - "3001:3000"
    environment:
      # The admin password can be overridden through GRAFANA_ADMIN_PASSWORD
      # (shell environment or .env file). It falls back to "admin" so existing
      # local setups keep working; set a real value anywhere the port is
      # reachable by others.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      # Must match the host-side port published above.
      - GF_SERVER_ROOT_URL=http://localhost:3001
      # Points at a file inside the dashboards bind mount below.
      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/scalarlm-overview.json
    volumes:
      - ./deployment/observability/grafana/provisioning:/etc/grafana/provisioning
      - ./deployment/observability/grafana/dashboards:/var/lib/grafana/dashboards
      - grafana-data:/var/lib/grafana
    # Start-order only: the three backends are started before Grafana.
    depends_on:
      - prometheus
      - loki
      - tempo
    restart: unless-stopped

  # Loki service; its HTTP port 3100 is published on the same host port.
  loki:
    container_name: scalarlm-loki
    image: grafana/loki:latest
    restart: unless-stopped
    # Points Loki at the config file bind-mounted below.
    command:
      - "-config.file=/etc/loki/local-config.yaml"
    ports:
      - '3100:3100'
    volumes:
      # The repository's loki-config.yml is mounted under the name passed to
      # -config.file.
      - ./deployment/observability/loki-config.yml:/etc/loki/local-config.yaml
      - loki-data:/loki

  # Tempo service. Only its HTTP port is published on the host.
  tempo:
    image: grafana/tempo:latest
    container_name: scalarlm-tempo
    ports:
      - "3200:3200"   # Tempo HTTP
    # The OTLP ports are NOT published on the host: otel-collector already
    # binds host ports 4317/4318, and publishing them here as well makes
    # whichever container starts second fail with "port is already allocated".
    # Other services on the Compose network can still reach tempo:4317 and
    # tempo:4318 directly.
    # NOTE(review): assumes otel-collector.yml exports traces to tempo:4317 or
    # tempo:4318 - confirm against that config.
    expose:
      - "4317"   # OTLP gRPC (internal only)
      - "4318"   # OTLP HTTP (internal only)
    volumes:
      - ./deployment/observability/tempo-config.yml:/etc/tempo/tempo.yaml
      - tempo-data:/tmp/tempo
    command: -config.file=/etc/tempo/tempo.yaml
    restart: unless-stopped

  # OpenTelemetry Collector: publishes the OTLP ports on the host.
  otel-collector:
    container_name: scalarlm-otel-collector
    image: otel/opentelemetry-collector:latest
    restart: unless-stopped
    # Points the collector at the config file bind-mounted below.
    command:
      - "--config=/etc/otel-collector.yaml"
    ports:
      - '4317:4317'   # OTLP gRPC (for ScalarLM to send traces)
      - '4318:4318'   # OTLP HTTP
    volumes:
      - ./deployment/observability/otel-collector.yml:/etc/otel-collector.yaml
    # Start-order only: these services are started before the collector.
    depends_on:
      - prometheus
      - loki
      - tempo

  # DCGM exporter - ONLY for NVIDIA GPU systems.
  # This service belongs to the "gpu" profile, so it is NOT started by default
  # and CPU-only hosts need no changes. To include it, run:
  #   docker compose -f docker-compose.observability.yaml --profile gpu up -d
  dcgm-exporter:
    container_name: scalarlm-dcgm-exporter
    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.1.8-3.1.5-ubuntu20.04
    # Opt-in service: only started when the "gpu" profile is enabled.
    profiles:
      - gpu  # Only start with --profile gpu
    restart: unless-stopped
    # Runs the container under the "nvidia" runtime.
    runtime: nvidia
    cap_add:
      - SYS_ADMIN
    # Values are quoted so they reach the container as strings.
    environment:
      DCGM_EXPORTER_LISTEN: ":9400"
      DCGM_EXPORTER_KUBERNETES: "false"
    ports:
      - '9400:9400'

# Named volumes referenced by the services above; all use default settings.
volumes:
  prometheus-data: {}
  grafana-data: {}
  loki-data: {}
  tempo-data: {}