-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathdocker-compose.observability.yaml
More file actions
101 lines (95 loc) · 3 KB
/
Copy pathdocker-compose.observability.yaml
File metadata and controls
101 lines (95 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
services:
# Prometheus server; its web UI/API is published on host port 9090.
prometheus:
  image: prom/prometheus:latest
  container_name: scalarlm-prometheus
  restart: unless-stopped
  ports:
    - '9090:9090'
  extra_hosts:
    # Lets the container resolve the Docker host as host.docker.internal.
    - 'host.docker.internal:host-gateway'
  volumes:
    # Main configuration and alert rules are bind-mounted from the repository.
    - ./deployment/observability/prometheus.yml:/etc/prometheus/prometheus.yml
    - ./deployment/observability/alerts.yml:/etc/prometheus/alerts.yml
    # Named volume backing the TSDB path given to --storage.tsdb.path below.
    - prometheus-data:/prometheus
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # Keep 15 days of samples.
    - "--storage.tsdb.retention.time=15d"
    # Turns on the HTTP reload/shutdown (lifecycle) endpoints.
    - "--web.enable-lifecycle"
# Grafana UI, published on host port 3001 (container port 3000).
grafana:
  image: grafana/grafana:latest
  container_name: scalarlm-grafana
  ports:
    - "3001:3000"
  environment:
    # The admin password is taken from GRAFANA_ADMIN_PASSWORD (shell
    # environment or .env file) instead of being hard-coded. It falls back to
    # "admin" when the variable is unset or empty, so existing local setups
    # keep working; set the variable for anything reachable by others.
    - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}"
    - GF_USERS_ALLOW_SIGN_UP=false
    # Must match the host port published above.
    - GF_SERVER_ROOT_URL=http://localhost:3001
    # Dashboard shown as the home page; the file comes from the dashboards
    # bind mount below.
    - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/scalarlm-overview.json
  volumes:
    - ./deployment/observability/grafana/provisioning:/etc/grafana/provisioning
    - ./deployment/observability/grafana/dashboards:/var/lib/grafana/dashboards
    # Named volume for Grafana's own state.
    - grafana-data:/var/lib/grafana
  # Start the backing services before Grafana (start order only).
  depends_on:
    - prometheus
    - loki
    - tempo
  restart: unless-stopped
# Loki, published on host port 3100.
loki:
  image: grafana/loki:latest
  container_name: scalarlm-loki
  restart: unless-stopped
  ports:
    - '3100:3100'
  volumes:
    # The repository config is mounted at the path passed to -config.file.
    - ./deployment/observability/loki-config.yml:/etc/loki/local-config.yaml
    # Named volume mounted at /loki.
    - loki-data:/loki
  command:
    - '-config.file=/etc/loki/local-config.yaml'
# Tempo; only its HTTP API is published to the host.
tempo:
  image: grafana/tempo:latest
  container_name: scalarlm-tempo
  ports:
    - "3200:3200"  # Tempo HTTP
  # The OTLP ports are NOT published to the host: otel-collector already binds
  # host ports 4317/4318, and two services cannot publish the same host port
  # (the second one fails with "port is already allocated"). Other services
  # in this compose project can still reach them as tempo:4317 / tempo:4318.
  # If direct host access to Tempo's OTLP receivers is ever needed, publish
  # them on different host ports (e.g. "14317:4317").
  expose:
    - "4317"  # OTLP gRPC
    - "4318"  # OTLP HTTP
  volumes:
    # The repository config is mounted at the path passed to -config.file.
    - ./deployment/observability/tempo-config.yml:/etc/tempo/tempo.yaml
    # Named volume mounted at /tmp/tempo.
    - tempo-data:/tmp/tempo
  command: -config.file=/etc/tempo/tempo.yaml
  restart: unless-stopped
# OpenTelemetry Collector: the OTLP endpoint published on the host.
otel-collector:
  image: otel/opentelemetry-collector:latest
  container_name: scalarlm-otel-collector
  restart: unless-stopped
  # Start the other observability services first (start order only).
  depends_on:
    - prometheus
    - loki
    - tempo
  ports:
    - '4317:4317'  # OTLP gRPC (for ScalarLM to send traces)
    - '4318:4318'  # OTLP HTTP
  volumes:
    # The repository config is mounted at the path passed to --config.
    - ./deployment/observability/otel-collector.yml:/etc/otel-collector.yaml
  command:
    - '--config=/etc/otel-collector.yaml'
# DCGM exporter - ONLY for NVIDIA GPU systems.
# The service is assigned to the "gpu" profile, so a plain `up` skips it and
# CPU-only hosts need no changes. To start it on a GPU host:
#   docker-compose -f docker-compose.observability.yaml --profile gpu up -d
# NVIDIA DCGM exporter, published on host port 9400.
dcgm-exporter:
  image: nvcr.io/nvidia/k8s/dcgm-exporter:3.1.8-3.1.5-ubuntu20.04
  container_name: scalarlm-dcgm-exporter
  profiles: [gpu]  # Only start with --profile gpu
  # Runs under the NVIDIA container runtime.
  runtime: nvidia
  cap_add:
    - SYS_ADMIN
  environment:
    # Listen address matches the port published below.
    - 'DCGM_EXPORTER_LISTEN=:9400'
    - 'DCGM_EXPORTER_KUBERNETES=false'
  ports:
    - '9400:9400'
  restart: unless-stopped
# Named volumes referenced by the services above; each uses default settings
# (an explicit empty mapping is equivalent to a bare key).
volumes:
  prometheus-data: {}
  grafana-data: {}
  loki-data: {}
  tempo-data: {}