diff --git a/README.md b/README.md index ae07bae..4b1c518 100644 --- a/README.md +++ b/README.md @@ -188,9 +188,21 @@ go build -o kagent-tools . The server runs using sse transport for MCP communication. +#### CLI Flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--port`, `-p` | `8084` | Port to run the MCP server on | +| `--metrics-port`, `-m` | `0` | Port to run the Prometheus metrics server on (`0` means the same port as `--port`) | +| `--stdio` | `false` | Use stdio for communication instead of HTTP | +| `--tools` | `[]` (all) | Comma-separated list of tool providers to register | +| `--read-only` | `false` | Disable tools that perform write operations | +| `--kubeconfig` | `""` | Path to kubeconfig file (defaults to in-cluster config) | +| `--version`, `-v` | `false` | Show version information and exit | + ### Testing ```bash -go test -v +go test -v ./... ``` ## Tool Implementation Details @@ -243,6 +255,25 @@ Tools can be configured through environment variables: - `GRAFANA_URL`: Default Grafana server URL - `GRAFANA_API_KEY`: Default Grafana API key +## Observability + +The MCP server exposes Prometheus metrics on a configurable HTTP endpoint (`/metrics`). By default, the metrics endpoint runs on the same port as the MCP server. 
To run it on a separate port: + +```bash +./kagent-tools --port 8084 --metrics-port 9090 +``` + +### Exposed Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `kagent_tools_mcp_server_info` | Gauge | `server_name`, `version`, `git_commit`, `build_date`, `server_mode` | Server metadata (always set to 1) | +| `kagent_tools_mcp_registered_tools` | Gauge | `tool_name`, `tool_provider` | Set to 1 for each registered tool | +| `kagent_tools_mcp_invocations_total` | Counter | `tool_name`, `tool_provider` | Total number of tool invocations | +| `kagent_tools_mcp_invocations_failure_total` | Counter | `tool_name`, `tool_provider` | Total number of failed tool invocations | + +Standard Go runtime and process metrics are also included (goroutines, memory, CPU, file descriptors, etc.). + ## Error Handling and Debugging The tools provide detailed error messages and support verbose output. When debugging issues: @@ -258,9 +289,8 @@ Potential areas for future improvement: 1. **Native Client Libraries**: Replace CLI calls with native Go client libraries where possible 2. **Advanced Documentation Search**: Implement full vector search for documentation queries 3. **Caching**: Add caching for frequently accessed data -4. **Metrics and Observability**: Add metrics and tracing for tool usage -5. **Configuration Management**: Enhanced configuration management and validation -6. **Parallel Execution**: Support for parallel execution of related operations +4. **Configuration Management**: Enhanced configuration management and validation +5. 
**Parallel Execution**: Support for parallel execution of related operations ## Contributing diff --git a/cmd/main.go b/cmd/main.go index 374d7ee..943b7db 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -8,6 +8,7 @@ import ( "os" "os/signal" "runtime" + "strconv" "strings" "sync" "syscall" @@ -15,6 +16,7 @@ import ( "github.com/joho/godotenv" "github.com/kagent-dev/tools/internal/logger" + "github.com/kagent-dev/tools/internal/metrics" "github.com/kagent-dev/tools/internal/telemetry" "github.com/kagent-dev/tools/internal/version" "github.com/kagent-dev/tools/pkg/argo" @@ -25,16 +27,19 @@ import ( "github.com/kagent-dev/tools/pkg/kubescape" "github.com/kagent-dev/tools/pkg/prometheus" "github.com/kagent-dev/tools/pkg/utils" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/spf13/cobra" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" + "github.com/mark3labs/mcp-go/mcp" "github.com/mark3labs/mcp-go/server" ) var ( port int + metricsPort int stdio bool tools []string kubeconfig *string @@ -56,6 +61,7 @@ var rootCmd = &cobra.Command{ func init() { rootCmd.Flags().IntVarP(&port, "port", "p", 8084, "Port to run the server on") + rootCmd.Flags().IntVarP(&metricsPort, "metrics-port", "m", 0, "Port to run the metrics server on (default 0: same as --port)") rootCmd.Flags().BoolVar(&stdio, "stdio", false, "Use stdio for communication instead of HTTP") rootCmd.Flags().StringSliceVar(&tools, "tools", []string{}, "List of tools to register. 
If empty, all tools are registered.") rootCmd.Flags().BoolVarP(&showVersion, "version", "v", false, "Show version information and exit") @@ -92,6 +98,11 @@ func run(cmd *cobra.Command, args []string) { return } + // 0 means "same as --port" - resolve it before any server logic uses it + if metricsPort == 0 { + metricsPort = port + } + logger.Init(stdio) defer logger.Sync() @@ -134,8 +145,11 @@ func run(cmd *cobra.Command, args []string) { Version, ) - // Register tools - registerMCP(mcp, tools, *kubeconfig, readOnly) + // Register tools and wrap handlers with metrics instrumentation. + // registerMCP returns a map of tool_name -> tool_provider so that + // wrapToolHandlersWithMetrics knows which provider each tool belongs to. + toolProviders := registerMCP(mcp, tools, *kubeconfig, readOnly) + wrapToolHandlersWithMetrics(mcp, toolProviders) // Create wait group for server goroutines var wg sync.WaitGroup @@ -146,6 +160,7 @@ func run(cmd *cobra.Command, args []string) { // HTTP server reference (only used when not in stdio mode) var httpServer *http.Server + var metricsServer *http.Server // Separate server for metrics if metricsPort is different from main port // Start server based on chosen mode wg.Add(1) @@ -170,17 +185,40 @@ func run(cmd *cobra.Command, args []string) { } }) - // Add metrics endpoint (basic implementation for e2e tests) - mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/plain") - w.WriteHeader(http.StatusOK) - - // Generate real runtime metrics instead of hardcoded values - metrics := generateRuntimeMetrics() - if err := writeResponse(w, []byte(metrics)); err != nil { - logger.Get().Error("Failed to write metrics response", "error", err) + // Add metrics endpoint + registry := metrics.InitServer() // Initialize Prometheus metrics before starting the server + + if metricsPort != port { // Only start a separate metrics server if the metrics port is different from the main server port + 
// Create the metrics server outside the goroutine to avoid a race condition + // between the goroutine assigning metricsServer and the shutdown handler reading it + metricsMux := http.NewServeMux() + metricsMux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) + metricsServer = &http.Server{ + Addr: fmt.Sprintf(":%d", metricsPort), + Handler: metricsMux, } - }) + + wg.Add(1) + go func() { + defer wg.Done() + logger.Get().Info("Starting Prometheus metrics endpoint on /metrics", "port", strconv.Itoa(metricsPort)) + if err := metricsServer.ListenAndServe(); err != nil { + if !errors.Is(err, http.ErrServerClosed) { + logger.Get().Error("Metrics endpoint failed", "error", err) + } else { + logger.Get().Info("Metrics server closed gracefully.") + } + } + }() + } else { + logger.Get().Info("Starting Prometheus metrics endpoint on /metrics", "port", strconv.Itoa(port)) + mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{})) + } + serverMode := "read-write" + if readOnly { + serverMode = "read-only" + } + metrics.KagentToolsMCPServerInfo.WithLabelValues(Name, Version, GitCommit, BuildDate, serverMode).Set(1) // Handle all other routes with the MCP server wrapped in telemetry middleware mux.Handle("/", telemetry.HTTPMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -229,6 +267,19 @@ func run(cmd *cobra.Command, args []string) { rootSpan.AddEvent("server.shutdown.completed") } } + + // Gracefully shutdown metrics server if running separately + if !stdio && metricsServer != nil { + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + + if err := metricsServer.Shutdown(shutdownCtx); err != nil { + logger.Get().Error("Failed to shutdown metrics server gracefully", "error", err) + rootSpan.RecordError(err) + } else { + logger.Get().Info("Metrics server shutdown completed") + } + } }() // Wait for all server operations to complete @@ -242,47 
+293,6 @@ func writeResponse(w http.ResponseWriter, data []byte) error { return err } -// generateRuntimeMetrics generates real runtime metrics for the /metrics endpoint -func generateRuntimeMetrics() string { - var m runtime.MemStats - runtime.ReadMemStats(&m) - - now := time.Now().Unix() - - // Build metrics in Prometheus format - metrics := strings.Builder{} - - // Go runtime info - metrics.WriteString("# HELP go_info Information about the Go environment.\n") - metrics.WriteString("# TYPE go_info gauge\n") - metrics.WriteString(fmt.Sprintf("go_info{version=\"%s\"} 1\n", runtime.Version())) - - // Process start time - metrics.WriteString("# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.\n") - metrics.WriteString("# TYPE process_start_time_seconds gauge\n") - metrics.WriteString(fmt.Sprintf("process_start_time_seconds %d\n", now)) - - // Memory metrics - metrics.WriteString("# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.\n") - metrics.WriteString("# TYPE go_memstats_alloc_bytes gauge\n") - metrics.WriteString(fmt.Sprintf("go_memstats_alloc_bytes %d\n", m.Alloc)) - - metrics.WriteString("# HELP go_memstats_total_alloc_bytes Total number of bytes allocated, even if freed.\n") - metrics.WriteString("# TYPE go_memstats_total_alloc_bytes counter\n") - metrics.WriteString(fmt.Sprintf("go_memstats_total_alloc_bytes %d\n", m.TotalAlloc)) - - metrics.WriteString("# HELP go_memstats_sys_bytes Number of bytes obtained from system.\n") - metrics.WriteString("# TYPE go_memstats_sys_bytes gauge\n") - metrics.WriteString(fmt.Sprintf("go_memstats_sys_bytes %d\n", m.Sys)) - - // Goroutine count - metrics.WriteString("# HELP go_goroutines Number of goroutines that currently exist.\n") - metrics.WriteString("# TYPE go_goroutines gauge\n") - metrics.WriteString(fmt.Sprintf("go_goroutines %d\n", runtime.NumGoroutine())) - - return metrics.String() -} - func runStdioServer(ctx context.Context, mcp *server.MCPServer) { 
logger.Get().Info("Running KAgent Tools Server STDIO:", "tools", strings.Join(tools, ",")) stdioServer := server.NewStdioServer(mcp) @@ -291,7 +301,11 @@ func runStdioServer(ctx context.Context, mcp *server.MCPServer) { } } -func registerMCP(mcp *server.MCPServer, enabledToolProviders []string, kubeconfig string, readOnly bool) { +// registerMCP registers tool providers with the MCP server and returns a mapping +// of tool_name -> tool_provider. This mapping is built using the ListTools() diff +// technique: we snapshot the tool list before and after each provider registers, +// so we know exactly which tools belong to which provider. +func registerMCP(mcp *server.MCPServer, enabledToolProviders []string, kubeconfig string, readOnly bool) map[string]string { // A map to hold tool providers and their registration functions toolProviderMap := map[string]func(*server.MCPServer){ "argo": func(s *server.MCPServer) { argo.RegisterTools(s, readOnly) }, @@ -310,11 +324,83 @@ func registerMCP(mcp *server.MCPServer, enabledToolProviders []string, kubeconfi enabledToolProviders = append(enabledToolProviders, name) } } + + // toolToProvider maps each tool name to its provider (e.g., "kubectl_get" -> "k8s"). + // This is used later by wrapToolHandlersWithMetrics to set the correct tool_provider label. + toolToProvider := make(map[string]string) + for _, toolProviderName := range enabledToolProviders { if registerFunc, ok := toolProviderMap[toolProviderName]; ok { + // Snapshot the tool list before this provider registers its tools. + // We need this because ListTools() returns ALL tools from ALL providers, + // so the only way to know which tools belong to THIS provider is to compare + // the list before and after registration. + toolsBefore := mcp.ListTools() + registerFunc(mcp) + + // Determine which tools were just registered by this provider + // by finding tools that exist now but didn't exist before. 
+ // Record each one in Prometheus so we can observe the full tool inventory. + for toolName := range mcp.ListTools() { + if _, existed := toolsBefore[toolName]; !existed { + metrics.KagentToolsMCPRegisteredTools.WithLabelValues(toolName, toolProviderName).Set(1) + toolToProvider[toolName] = toolProviderName + } + } } else { logger.Get().Error("Unknown tool specified", "provider", toolProviderName) } } + + return toolToProvider +} + +// wrapToolHandlersWithMetrics applies the wrapper/middleware pattern to instrument +// all registered MCP tool handlers with Prometheus invocation counters. +// +// How it works: +// 1. Grab all registered tools from the MCP server using ListTools() +// 2. For each tool, wrap its handler with a function that increments metrics +// 3. Replace all tools in the MCP server using SetTools() +// +// The wrapper function: +// - Increments kagent_tools_mcp_invocations_total on every call +// - Increments kagent_tools_mcp_invocations_failure_total when the handler returns a +// non-nil Go error OR when result.IsError is true (the MCP convention for tool-level +// failures - handlers return NewToolResultError(...), nil, not a Go error) +// - Calls the original handler unchanged - the tool's behaviour is not affected +// +// This uses the standard middleware/decorator pattern: the original handler and the +// wrapped handler have the same function signature, so they are interchangeable. +// No changes are required in any pkg/ file - all instrumentation happens centrally here. 
+func wrapToolHandlersWithMetrics(mcpServer *server.MCPServer, toolToProvider map[string]string) { + allTools := mcpServer.ListTools() + wrapped := make([]server.ServerTool, 0, len(allTools)) + + for name, st := range allTools { + originalHandler := st.Handler + toolName := name // capture for closure + provider := toolToProvider[toolName] + + wrapped = append(wrapped, server.ServerTool{ + Tool: st.Tool, + Handler: func(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + metrics.KagentToolsMCPInvocationsTotal.WithLabelValues(toolName, provider).Inc() + + result, err := originalHandler(ctx, req) + + // Count as failure if the Go error is non-nil OR if the tool returned + // a result with IsError=true (the MCP convention for tool-level failures, + // which always return nil for the Go error). + if err != nil || (result != nil && result.IsError) { + metrics.KagentToolsMCPInvocationsFailureTotal.WithLabelValues(toolName, provider).Inc() + } + + return result, err + }, + }) + } + + mcpServer.SetTools(wrapped...) } diff --git a/cmd/metrics_wrap_test.go b/cmd/metrics_wrap_test.go new file mode 100644 index 0000000..0b8ca73 --- /dev/null +++ b/cmd/metrics_wrap_test.go @@ -0,0 +1,127 @@ +package main + +import ( + "context" + "fmt" + "testing" + + "github.com/kagent-dev/tools/internal/metrics" + "github.com/mark3labs/mcp-go/mcp" + "github.com/mark3labs/mcp-go/server" + promtest "github.com/prometheus/client_golang/prometheus/testutil" +) + +// newTestServer creates a fresh MCP server and resets the metric counters so +// tests do not interfere with each other. +func newTestServer() *server.MCPServer { + metrics.KagentToolsMCPInvocationsTotal.Reset() + metrics.KagentToolsMCPInvocationsFailureTotal.Reset() + return server.NewMCPServer("test-server", "test") +} + +// invokeWrapped registers handler on s, wraps all handlers with metrics, then +// calls the wrapped handler for toolName and returns its result. 
+func invokeWrapped(t *testing.T, s *server.MCPServer, toolName string, provider string, handler server.ToolHandlerFunc) (*mcp.CallToolResult, error) { + t.Helper() + s.AddTool(mcp.Tool{Name: toolName}, handler) + wrapToolHandlersWithMetrics(s, map[string]string{toolName: provider}) + st, ok := s.ListTools()[toolName] + if !ok { + t.Fatalf("tool %q not found after wrapping", toolName) + } + return st.Handler(context.Background(), mcp.CallToolRequest{}) +} + +// TestWrapToolHandlersWithMetrics_IsErrorIncrementsFailureCounter is the +// critical regression test for the bug identified in PR review: +// +// Handlers signal tool-level failures via NewToolResultError(...), nil +// (result.IsError=true, Go error=nil), so checking only `err != nil` would +// never count these as failures. +// +// To replicate manually: +// +// go test -v -run TestWrapToolHandlersWithMetrics_IsErrorIncrementsFailureCounter ./cmd/ +func TestWrapToolHandlersWithMetrics_IsErrorIncrementsFailureCounter(t *testing.T) { + s := newTestServer() + + result, err := invokeWrapped(t, s, "failing_tool", "test", + func(_ context.Context, _ mcp.CallToolRequest) (*mcp.CallToolResult, error) { + // This is the pattern used 214 times across pkg/ - returns a tool-level + // error with IsError=true but a nil Go error. 
+ return mcp.NewToolResultError("kubectl: resource not found"), nil + }, + ) + + if err != nil { + t.Fatalf("expected nil Go error from handler, got: %v", err) + } + if !result.IsError { + t.Fatal("expected result.IsError=true") + } + + total := promtest.ToFloat64(metrics.KagentToolsMCPInvocationsTotal.WithLabelValues("failing_tool", "test")) + if total != 1 { + t.Errorf("invocations_total: expected 1, got %v", total) + } + + failures := promtest.ToFloat64(metrics.KagentToolsMCPInvocationsFailureTotal.WithLabelValues("failing_tool", "test")) + if failures != 1 { + t.Errorf("invocations_failure_total: expected 1, got %v (IsError=true was not counted as failure)", failures) + } +} + +// TestWrapToolHandlersWithMetrics_SuccessDoesNotIncrementFailureCounter verifies +// that a successful tool call does not touch the failure counter. +// +// To replicate manually: +// +// go test -v -run TestWrapToolHandlersWithMetrics_SuccessDoesNotIncrementFailureCounter ./cmd/ +func TestWrapToolHandlersWithMetrics_SuccessDoesNotIncrementFailureCounter(t *testing.T) { + s := newTestServer() + + _, err := invokeWrapped(t, s, "success_tool", "test", + func(_ context.Context, _ mcp.CallToolRequest) (*mcp.CallToolResult, error) { + return mcp.NewToolResultText("all good"), nil + }, + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + total := promtest.ToFloat64(metrics.KagentToolsMCPInvocationsTotal.WithLabelValues("success_tool", "test")) + if total != 1 { + t.Errorf("invocations_total: expected 1, got %v", total) + } + + failures := promtest.ToFloat64(metrics.KagentToolsMCPInvocationsFailureTotal.WithLabelValues("success_tool", "test")) + if failures != 0 { + t.Errorf("invocations_failure_total: expected 0 for a successful call, got %v", failures) + } +} + +// TestWrapToolHandlersWithMetrics_GoErrorIncrementsFailureCounter verifies +// that a real Go error (e.g. infrastructure failure) is also counted. 
+// +// To replicate manually: +// +// go test -v -run TestWrapToolHandlersWithMetrics_GoErrorIncrementsFailureCounter ./cmd/ +func TestWrapToolHandlersWithMetrics_GoErrorIncrementsFailureCounter(t *testing.T) { + s := newTestServer() + + _, err := invokeWrapped(t, s, "broken_tool", "test", + func(_ context.Context, _ mcp.CallToolRequest) (*mcp.CallToolResult, error) { + return nil, fmt.Errorf("connection refused") + }, + ) + + if err == nil { + t.Fatal("expected a Go error, got nil") + } + + failures := promtest.ToFloat64(metrics.KagentToolsMCPInvocationsFailureTotal.WithLabelValues("broken_tool", "test")) + if failures != 1 { + t.Errorf("invocations_failure_total: expected 1 for Go error, got %v", failures) + } +} diff --git a/dashboard/grafana-dash-example.png b/dashboard/grafana-dash-example.png new file mode 100644 index 0000000..6ffe311 Binary files /dev/null and b/dashboard/grafana-dash-example.png differ diff --git a/dashboard/grafana-dashboard.json b/dashboard/grafana-dashboard.json new file mode 100644 index 0000000..801a052 --- /dev/null +++ b/dashboard/grafana-dashboard.json @@ -0,0 +1,819 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 29, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + 
"percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^version$/", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "kagent_tools_mcp_server_info", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Server Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(kagent_tools_mcp_registered_tools)", + "instant": true, + "legendFormat": "Registered Tools", + "range": false, + "refId": "A" + } + ], + "title": "Total Registered Tools", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 
100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(increase(kagent_tools_mcp_invocations_total[5m]))", + "instant": true, + "legendFormat": "Total Invocations (5m)", + "range": false, + "refId": "A" + } + ], + "title": "Invocations (Last 5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "100 - (sum(rate(kagent_tools_mcp_invocations_failure_total[5m])) / sum(rate(kagent_tools_mcp_invocations_total[5m])) * 100)", + "instant": true, + "legendFormat": "Success Rate", + "range": false, + "refId": "A" + } + ], + "title": 
"Success Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(kagent_tools_mcp_invocations_total[$__rate_interval])) by (tool_provider)", + "legendFormat": "{{tool_provider}}", + "range": true, + "refId": "A" + } + ], + "title": "Invocation Rate by Provider", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", 
+ "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(kagent_tools_mcp_invocations_total[$__rate_interval]))", + "legendFormat": "Total", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(kagent_tools_mcp_invocations_failure_total[$__rate_interval]))", + "hide": false, + "legendFormat": "Failures", + "range": true, + "refId": "B" + } + ], + "title": "Total Invocations vs Failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": 
false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "sort": "desc", + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by(tool_provider) (kagent_tools_mcp_registered_tools)", + "legendFormat": "{{tool_provider}}", + "range": true, + "refId": "A" + } + ], + "title": "Tools by Provider", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "footer": { + "reducers": [] + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Invocations" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "thresholds" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + 
"color": "red", + "value": 10 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 12 + }, + "id": 8, + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Invocations" + } + ] + }, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(tool_name, tool_provider) (kagent_tools_mcp_invocations_total)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(tool_name, tool_provider) (kagent_tools_mcp_invocations_failure_total)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Top Invoked Tools", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "tool_name" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "tool_provider 2": true + }, + "includeByName": {}, + "indexByName": { + "Time 1": 4, + "Time 2": 5, + "Value #A": 2, + "Value #B": 3, + "tool_name": 0, + "tool_provider 1": 1, + "tool_provider 2": 6 + }, + "renameByName": { + "Value #A": "Invocations", + "Value #B": "Failures", + "tool_name": "Tool Name", + "tool_provider 1": "Provider" + } + } + } + ], + "type": "table" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 42, + "tags": [ + "kagent", + "mcp", + "tools" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "prometheus" + }, + "includeAll": false, + "label": "Datasource", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + 
"time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "kAgent Tools - MCP Observability", + "uid": "kagent-tools-mcp", + "version": 1 +} \ No newline at end of file diff --git a/go.mod b/go.mod index a2f27fc..e796d12 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,8 @@ require ( github.com/mark3labs/mcp-go v0.43.2 github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 + github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/spf13/cobra v1.10.2 github.com/stretchr/testify v1.11.1 github.com/tmc/langchaingo v0.1.14 @@ -109,6 +111,7 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.4 // indirect github.com/kubescape/go-logger v0.0.26 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mackerelio/go-osstat v0.2.6 // indirect github.com/mailru/easyjson v0.9.1 // indirect github.com/mattn/go-colorable v0.1.14 // indirect @@ -129,8 +132,6 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pkoukk/tiktoken-go v0.1.8 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.19.2 // indirect github.com/sagikazarmark/locafero v0.12.0 // indirect diff --git a/helm/kagent-tools/templates/deployment.yaml b/helm/kagent-tools/templates/deployment.yaml index 001caef..9787694 100644 --- a/helm/kagent-tools/templates/deployment.yaml +++ b/helm/kagent-tools/templates/deployment.yaml @@ -59,6 +59,8 @@ spec: args: - "--port" - "{{ .Values.service.ports.tools.targetPort }}" + - "--metrics-port" + - "{{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }}" {{- if .Values.tools.enabledTools }} - "--tools={{ join "," .Values.tools.enabledTools }}" {{- end }} @@ 
-98,6 +100,9 @@ spec: - name: http-tools containerPort: {{ .Values.service.ports.tools.targetPort }} protocol: TCP + - name: http-metrics + containerPort: {{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }} + protocol: TCP readinessProbe: tcpSocket: port: http-tools diff --git a/helm/kagent-tools/templates/service.yaml b/helm/kagent-tools/templates/service.yaml index 55c7fd2..f578670 100644 --- a/helm/kagent-tools/templates/service.yaml +++ b/helm/kagent-tools/templates/service.yaml @@ -19,3 +19,22 @@ spec: name: tools selector: {{- include "kagent.selectorLabels" . | nindent 4 }} + +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "kagent.fullname" . }}-metrics + namespace: {{ include "kagent.namespace" . }} + labels: + {{- include "kagent.labels" . | nindent 4 }} + app.kubernetes.io/component: metrics +spec: + selector: + {{- include "kagent.selectorLabels" . | nindent 4 }} + ports: + - name: prometheus-metrics + protocol: TCP + port: {{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }} + targetPort: {{ .Values.tools.metrics.port | default .Values.service.ports.tools.targetPort }} + \ No newline at end of file diff --git a/helm/kagent-tools/templates/servicemonitor.yaml b/helm/kagent-tools/templates/servicemonitor.yaml new file mode 100644 index 0000000..ded05cd --- /dev/null +++ b/helm/kagent-tools/templates/servicemonitor.yaml @@ -0,0 +1,23 @@ + +{{- if .Values.tools.metrics.servicemonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "kagent.fullname" . }} + namespace: {{ include "kagent.namespace" . }} + labels: + {{- toYaml .Values.tools.metrics.servicemonitor.labels | nindent 4 }} +spec: + selector: + matchLabels: + {{- include "kagent.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: metrics + namespaceSelector: + matchNames: + - {{ include "kagent.namespace" . 
}} + endpoints: + - port: prometheus-metrics + interval: {{ .Values.tools.metrics.servicemonitor.interval | default "30s" }} + scrapeTimeout: {{ .Values.tools.metrics.servicemonitor.scrapeTimeout | default "10s" }} + path: {{ .Values.tools.metrics.servicemonitor.path | default "/metrics" }} +{{- end }} diff --git a/helm/kagent-tools/values.yaml b/helm/kagent-tools/values.yaml index 556f56e..dd9ef09 100644 --- a/helm/kagent-tools/values.yaml +++ b/helm/kagent-tools/values.yaml @@ -5,6 +5,15 @@ global: tag: "" tools: + metrics: + # port defaults to the main --port value (same server). Set explicitly for a dedicated metrics port. + port: "" + servicemonitor: + enabled: false + interval: 30s + scrapeTimeout: 10s + labels: + release: prometheus loglevel: "debug" # List of tool providers to enable. Empty list means all tools are enabled. # Available: k8s, helm, istio, cilium, argo, prometheus, kubescape, utils diff --git a/internal/metrics/monitoring_server.go b/internal/metrics/monitoring_server.go new file mode 100644 index 0000000..275a01f --- /dev/null +++ b/internal/metrics/monitoring_server.go @@ -0,0 +1,69 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" +) + +// kAgent Tools MCP Server metrics definition +var ( + KagentToolsMCPServerInfo = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kagent_tools_mcp_server_info", + Help: "Information about the MCP server including version and build details", + }, + []string{ + "server_name", + "version", + "git_commit", + "build_date", + "server_mode", // e.g., "read-only" or "read-write" + }, + ) + + KagentToolsMCPRegisteredTools = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kagent_tools_mcp_registered_tools", + Help: "Set to 1 for each registered MCP tool provider", + }, + []string{ + "tool_name", + "tool_provider", + }, + ) + + KagentToolsMCPInvocationsTotal = prometheus.NewCounterVec( + 
prometheus.CounterOpts{ + Name: "kagent_tools_mcp_invocations_total", + Help: "Total number of MCP tool invocations", + }, + []string{"tool_name", "tool_provider"}, + ) + + KagentToolsMCPInvocationsFailureTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "kagent_tools_mcp_invocations_failure_total", + Help: "Total number of failed MCP tool invocations", + }, + []string{"tool_name", "tool_provider"}, + ) +) + +func InitServer() *prometheus.Registry { + // New registry for our custom metrics, separate from the default registry + registry := prometheus.NewRegistry() + + // Add Go runtime metrics ( goroutines, GC stats, etc. ) + registry.MustRegister(collectors.NewGoCollector()) + + // Add process metrics (CPU, memory, file descriptors, etc. ) + registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) + + // Register kAgent Tools MCP Server metrics + registry.MustRegister(KagentToolsMCPServerInfo) + registry.MustRegister(KagentToolsMCPRegisteredTools) + registry.MustRegister(KagentToolsMCPInvocationsTotal) + registry.MustRegister(KagentToolsMCPInvocationsFailureTotal) + + return registry +} diff --git a/internal/metrics/monitoring_server_test.go b/internal/metrics/monitoring_server_test.go new file mode 100644 index 0000000..495c3e1 --- /dev/null +++ b/internal/metrics/monitoring_server_test.go @@ -0,0 +1,268 @@ +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" +) + +func TestInitServer_ReturnsRegistry(t *testing.T) { + registry := InitServer() + if registry == nil { + t.Fatal("InitServer() returned nil registry") + } +} + +func TestInitServer_GathersMetrics(t *testing.T) { + registry := InitServer() + + families, err := registry.Gather() + if err != nil { + t.Fatalf("Failed to gather metrics: %v", err) + } + + if len(families) == 0 { + t.Fatal("Expected at least one metric family from Go/process collectors, got none") + } +} + 
+// TestInitServer_RegistersCustomMetrics verifies that InitServer registers
+// both the Go runtime collector and every kAgent Tools metric vector.
+func TestInitServer_RegistersCustomMetrics(t *testing.T) {
+	resetMetrics()
+	registry := InitServer()
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("Failed to gather metrics: %v", err)
+	}
+
+	// Build a set of metric names for easy lookup
+	metricNames := make(map[string]bool, len(families))
+	for _, family := range families {
+		metricNames[family.GetName()] = true
+	}
+
+	// Go collector metrics should be present
+	for _, name := range []string{"go_goroutines", "go_memstats_alloc_bytes"} {
+		if !metricNames[name] {
+			t.Errorf("Expected Go collector metric %q to be registered", name)
+		}
+	}
+
+	// A metric vector with no label values set yields no family in Gather(),
+	// so registration is verified by attempting to register the collector
+	// again: the registry must answer with AlreadyRegisteredError.
+	customCollectors := []struct {
+		name      string
+		collector prometheus.Collector
+	}{
+		{"kagent_tools_mcp_server_info", KagentToolsMCPServerInfo},
+		{"kagent_tools_mcp_registered_tools", KagentToolsMCPRegisteredTools},
+		{"kagent_tools_mcp_invocations_total", KagentToolsMCPInvocationsTotal},
+		{"kagent_tools_mcp_invocations_failure_total", KagentToolsMCPInvocationsFailureTotal},
+	}
+	for _, c := range customCollectors {
+		regErr := registry.Register(c.collector)
+		if _, ok := regErr.(prometheus.AlreadyRegisteredError); !ok {
+			t.Errorf("Expected %s to be registered by InitServer(), re-registering returned: %v", c.name, regErr)
+		}
+	}
+}
+
+// TestKagentToolsMCPServerInfo_SetAndGather sets the server info gauge once
+// and checks the gathered series, its labels and its value.
+func TestKagentToolsMCPServerInfo_SetAndGather(t *testing.T) {
+	resetMetrics()
+	registry := InitServer()
+
+	// Set the server info metric
+	KagentToolsMCPServerInfo.WithLabelValues(
+		"test-server",
+		"v0.0.1",
+		"abc123",
+		"2026-02-12",
+		"read-write",
+	).Set(1)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("Failed to gather metrics: %v", err)
+	}
+
+	found := findMetricFamily(families, "kagent_tools_mcp_server_info")
+	if found == nil {
+		t.Fatal("Expected kagent_tools_mcp_server_info metric to be present")
+	}
+
+	metrics := found.GetMetric()
+	if len(metrics) != 1 {
+		t.Fatalf("Expected 1 time series, got %d", len(metrics))
+	}
+
+	// Verify label values
+	assertLabels(t, metrics[0], map[string]string{
+		"server_name": "test-server",
+		"version":     "v0.0.1",
+		"git_commit":  "abc123",
+		"build_date":  "2026-02-12",
+		"server_mode": "read-write",
+	})
+
+	// Verify gauge value is 1
+	if metrics[0].GetGauge().GetValue() != 1 {
+		t.Errorf("Expected gauge value 1, got %f", metrics[0].GetGauge().GetValue())
+	}
+}
+
+// TestKagentToolsMCPRegisteredTools_SetAndGather marks two tools as
+// registered and expects exactly one series per tool.
+func TestKagentToolsMCPRegisteredTools_SetAndGather(t *testing.T) {
+	resetMetrics()
+	registry := InitServer()
+
+	// Register a couple of tool providers
+	KagentToolsMCPRegisteredTools.WithLabelValues("kubectl_get", "k8s").Set(1)
+	KagentToolsMCPRegisteredTools.WithLabelValues("helm_list", "helm").Set(1)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("Failed to gather metrics: %v", err)
+	}
+
+	found := findMetricFamily(families, "kagent_tools_mcp_registered_tools")
+	if found == nil {
+		t.Fatal("Expected kagent_tools_mcp_registered_tools metric to be present")
+	}
+
+	metrics := found.GetMetric()
+	if len(metrics) != 2 {
+		t.Fatalf("Expected 2 time series (one per tool), got %d", len(metrics))
+	}
+}
+
+// TestKagentToolsMCPInvocationsTotal_IncAndGather increments the invocation
+// counter for two tools and checks the per-tool totals.
+func TestKagentToolsMCPInvocationsTotal_IncAndGather(t *testing.T) {
+	resetMetrics()
+	registry := InitServer()
+
+	// Simulate a few tool invocations
+	KagentToolsMCPInvocationsTotal.WithLabelValues("kubectl_get", "k8s").Inc()
+	KagentToolsMCPInvocationsTotal.WithLabelValues("kubectl_get", "k8s").Inc()
+	KagentToolsMCPInvocationsTotal.WithLabelValues("helm_list", "helm").Inc()
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("Failed to gather metrics: %v", err)
+	}
+
+	found := findMetricFamily(families, "kagent_tools_mcp_invocations_total")
+	if found == nil {
+		t.Fatal("Expected kagent_tools_mcp_invocations_total metric to be present")
+	}
+
+	metrics := found.GetMetric()
+	if len(metrics) != 2 {
+		t.Fatalf("Expected 2 time series (one per tool), got %d", len(metrics))
+	}
+
+	// Index the counter values by tool_name so that a missing series is
+	// reported instead of being silently skipped.
+	counts := make(map[string]float64, len(metrics))
+	for _, m := range metrics {
+		for _, label := range m.GetLabel() {
+			if label.GetName() == "tool_name" {
+				counts[label.GetValue()] = m.GetCounter().GetValue()
+			}
+		}
+	}
+
+	expectedCounts := map[string]float64{
+		"kubectl_get": 2,
+		"helm_list":   1,
+	}
+	for tool, want := range expectedCounts {
+		got, ok := counts[tool]
+		if !ok {
+			t.Errorf("Expected a time series for tool %q, found none", tool)
+			continue
+		}
+		if got != want {
+			t.Errorf("Expected %s counter to be %g, got %f", tool, want, got)
+		}
+	}
+}
+
+// TestKagentToolsMCPInvocationsFailureTotal_IncAndGather records one failed
+// invocation and checks the resulting series, its value and its labels.
+func TestKagentToolsMCPInvocationsFailureTotal_IncAndGather(t *testing.T) {
+	resetMetrics()
+	registry := InitServer()
+
+	// Simulate a tool failure
+	KagentToolsMCPInvocationsFailureTotal.WithLabelValues("helm_install", "helm").Inc()
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("Failed to gather metrics: %v", err)
+	}
+
+	found := findMetricFamily(families, "kagent_tools_mcp_invocations_failure_total")
+	if found == nil {
+		t.Fatal("Expected kagent_tools_mcp_invocations_failure_total metric to be present")
+	}
+
+	metrics := found.GetMetric()
+	if len(metrics) != 1 {
+		t.Fatalf("Expected 1 time series, got %d", len(metrics))
+	}
+
+	if metrics[0].GetCounter().GetValue() != 1 {
+		t.Errorf("Expected failure counter to be 1, got %f", metrics[0].GetCounter().GetValue())
+	}
+
+	// Verify labels
+	assertLabels(t, metrics[0], map[string]string{
+		"tool_name":     "helm_install",
+		"tool_provider": "helm",
+	})
+}
+
+// assertLabels checks that metric carries exactly the expected label set:
+// no unexpected labels, no missing labels, and matching values.
+func assertLabels(t *testing.T, metric *dto.Metric, expectedLabels map[string]string) {
+	t.Helper()
+
+	labels := metric.GetLabel()
+	if len(labels) != len(expectedLabels) {
+		t.Errorf("Expected %d labels, got %d", len(expectedLabels), len(labels))
+	}
+	for _, label := range labels {
+		expected, ok := expectedLabels[label.GetName()]
+		if !ok {
+			t.Errorf("Unexpected label %q", label.GetName())
+			continue
+		}
+		if label.GetValue() != expected {
+			t.Errorf("Label %q: expected %q, got %q", label.GetName(), expected, label.GetValue())
+		}
+	}
+}
+
+// findMetricFamily finds a metric family by name from a gathered slice.
+// It returns nil when no family with that name is present.
+func findMetricFamily(families []*dto.MetricFamily, name string) *dto.MetricFamily {
+	for _, family := range families {
+		if family.GetName() == name {
+			return family
+		}
+	}
+	return nil
+}
+
+// resetMetrics replaces the package-level metric vectors with fresh, empty
+// ones so that series recorded by one test are not visible to the next.
+// It must be called before InitServer() so the new vectors are the ones
+// that get registered. The definitions here have to mirror the ones in
+// monitoring_server.go.
+func resetMetrics() {
+	KagentToolsMCPServerInfo = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "kagent_tools_mcp_server_info",
+			Help: "Information about the MCP server including version and build details",
+		},
+		[]string{
+			"server_name",
+			"version",
+			"git_commit",
+			"build_date",
+			"server_mode",
+		},
+	)
+
+	KagentToolsMCPRegisteredTools = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "kagent_tools_mcp_registered_tools",
+			Help: "Set to 1 for each registered MCP tool provider",
+		},
+		[]string{
+			"tool_name",
+			"tool_provider",
+		},
+	)
+
+	KagentToolsMCPInvocationsTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "kagent_tools_mcp_invocations_total",
+			Help: "Total number of MCP tool invocations",
+		},
+		[]string{"tool_name", "tool_provider"},
+	)
+
+	KagentToolsMCPInvocationsFailureTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "kagent_tools_mcp_invocations_failure_total",
+			Help: "Total number of failed MCP tool invocations",
+		},
+		[]string{"tool_name", "tool_provider"},
+	)
+}
+
+// TestMain starts the test binary from pristine metric vectors.
+// InitServer() builds a new registry on every call, so registering the same
+// package-level vectors repeatedly is not a problem; what leaks between
+// tests is the recorded series, which is why each test that asserts on
+// series counts or values calls resetMetrics() itself.
+func TestMain(m *testing.M) {
+	resetMetrics()
+	m.Run()
+}