Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ PROMETHEUS_METRICS_ENABLED=False
# This maps external port 2001 to the internal Prometheus metrics port
EXTERNAL_PROM_METRICS_PORT=2001

# -----------------------------------------------------------------------------
# Notifications monitoring (optional)
# -----------------------------------------------------------------------------
# Healthchecks.io private tokens/UUIDs used by cronjob monitoring.
# The public base URL is defined in code and not stored in env.
HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS=
HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY=
HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY=
HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES=
HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT=
HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO=

# -----------------------------------------------------------------------------
# Email / Notifications (optional)
# -----------------------------------------------------------------------------
Expand Down
32 changes: 32 additions & 0 deletions backend/kernelCI/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,30 @@ def get_json_env_var(name, default):
# To run cronjobs locally, execute
# poetry run ./manage.py crontab arg
# where "arg" is add, remove or show

# Public ping endpoint; the private per-job tokens are read from the
# environment below so that they never live in the repository.
HEALTHCHECK_BASE_URL = "https://hc-ping.com"

# Each pair is (monitoring_id, environment variable holding its private token).
# An unset variable yields an empty string for that monitoring_id.
HEALTHCHECK_MONITORING_PATH_MAP: dict[str, str] = {
    monitoring_id: os.environ.get(env_var_name, "")
    for monitoring_id, env_var_name in (
        (
            "delete_unused_hardware_status",
            "HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS",
        ),
        (
            "notifications_hardware_summary",
            "HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY",
        ),
        (
            "notifications_metrics_summary",
            "HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY",
        ),
        (
            "notifications_new_issues",
            "HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES",
        ),
        (
            "notifications_summary_microsoft",
            "HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT",
        ),
        (
            "notifications_summary_maestro",
            "HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO",
        ),
    )
}
"""Maps monitoring_id to the relative_path that will be appended to the base healthcheck URL."""

SKIP_CRONJOBS = is_boolean_or_string_true(os.environ.get("SKIP_CRONJOBS", False))
if SKIP_CRONJOBS:
CRONJOBS = []
Expand All @@ -133,12 +157,15 @@ def get_json_env_var(name, default):
"CRONTAB_COMMAND_SUFFIX", ">> /proc/1/fd/1 2>&1"
)
CRONJOBS = [
# not using a monitoring_id in the first task since it should
# be removed once the denormalization is set in stone
("0 * * * *", "kernelCI_app.tasks.update_checkout_cache"),
(
"59 * * * *",
"django.core.management.call_command",
[
"notifications",
"--monitoring-id=notifications_new_issues",
"--action=new_issues",
"--to=kernelci-results@groups.io",
"--cc=gus@collabora.com",
Expand All @@ -151,6 +178,7 @@ def get_json_env_var(name, default):
"django.core.management.call_command",
[
"notifications",
"--monitoring-id=notifications_summary_microsoft",
"--action=summary",
"--to=kernelcialerts@microsoft.com",
"--cc=kernelci-results@groups.io",
Expand All @@ -165,6 +193,7 @@ def get_json_env_var(name, default):
"django.core.management.call_command",
[
"notifications",
"--monitoring-id=notifications_summary_maestro",
"--action=summary",
"--add-mailing-lists",
"--send",
Expand All @@ -177,6 +206,7 @@ def get_json_env_var(name, default):
"django.core.management.call_command",
[
"notifications",
"--monitoring-id=notifications_hardware_summary",
"--action=hardware_summary",
"--cc=kernelci-results@groups.io",
"--send",
Expand All @@ -188,13 +218,15 @@ def get_json_env_var(name, default):
"django.core.management.call_command",
[
"delete_unused_hardware_status",
"--monitoring-id=delete_unused_hardware_status",
],
),
(
"0 0 * * 6",
"django.core.management.call_command",
[
"notifications",
"--monitoring-id=notifications_metrics_summary",
"--action=metrics_summary",
"--to=kernelci@lists.linux.dev",
"--cc=kernelci-results@groups.io",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
import logging
from django.core.management.base import BaseCommand
from django.db import transaction
from kernelCI_app.management.commands.helpers.healthcheck import (
MONITORING_ID_PARAM_HELP_TEXT,
run_with_healthcheck_monitoring,
)
from kernelCI_app.models import HardwareStatus, LatestCheckout, ProcessedListingItems

logger = logging.getLogger(__name__)
Expand All @@ -30,8 +34,21 @@ def add_arguments(self, parser):
default=10000,
help="Number of records to delete per batch (default: 10000)",
)
parser.add_argument(
"--monitoring-id",
type=str,
default=None,
help=MONITORING_ID_PARAM_HELP_TEXT,
)

def handle(self, *args, **options):
    """Run the command body through the healthcheck monitoring wrapper.

    The wrapper receives the ``--monitoring-id`` option (``None`` when it was
    not supplied) and a zero-argument callable that runs ``_run_action`` with
    the parsed options; its return value is returned unchanged.
    """

    def monitored_action():
        return self._run_action(options)

    return run_with_healthcheck_monitoring(
        monitoring_id=options.get("monitoring_id"),
        action=monitored_action,
    )

def _run_action(self, options):
dry_run = options["dry_run"]
batch_size = options["batch_size"]

Expand Down
69 changes: 69 additions & 0 deletions backend/kernelCI_app/management/commands/helpers/healthcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from collections.abc import Callable
from typing import Any, Literal

from django.conf import settings

import requests

from kernelCI_app.helpers.logger import log_message

# Help text shared by the management commands that expose a --monitoring-id option.
MONITORING_ID_PARAM_HELP_TEXT = (
"Monitoring ID configured in settings for healthcheck.io pings "
"(optional, used only for monitoring the command execution over time)"
)
# Kinds of ping a monitored run can send. "start" and "fail" are appended to
# the ping URL as a suffix; "success" pings the URL with no suffix.
type PingStatus = Literal["start", "fail", "success"]


def _resolve_monitoring_url(*, monitoring_id: str, status: PingStatus) -> str | None:
    """Build the ping URL for ``monitoring_id`` and ``status``.

    Returns ``None`` when the id is missing from
    ``settings.HEALTHCHECK_MONITORING_PATH_MAP`` or maps to an empty value.
    """
    base_url: str = settings.HEALTHCHECK_BASE_URL
    path_by_monitoring_id: dict[str, str] = settings.HEALTHCHECK_MONITORING_PATH_MAP

    token_path = path_by_monitoring_id.get(monitoring_id)
    if not token_path:
        return None

    url_parts = [base_url.rstrip("/"), token_path.lstrip("/")]
    # A success ping targets the bare check URL; only the other statuses
    # are sent as a trailing path segment.
    if status != "success":
        url_parts.append(status)

    return "/".join(url_parts)


def _ping_healthcheck(*, monitoring_id: str, status: PingStatus) -> None:
    """Send one best-effort ping for ``monitoring_id`` with the given ``status``.

    The ping is skipped (and a message logged) when no URL can be resolved for
    the id. Network and HTTP errors are logged and swallowed so that a
    monitoring outage never breaks the monitored command.

    The ping URL embeds the private monitoring token, so it is never written
    to the log: messages identify the ping by ``monitoring_id`` and ``status``
    only, and the URL is scrubbed from any logged request error.
    """
    # Local import: the module's import block does not provide urlsplit.
    from urllib.parse import urlsplit

    monitoring_url = _resolve_monitoring_url(monitoring_id=monitoring_id, status=status)
    if not monitoring_url:
        log_message(
            "No healthcheck URL configured for monitoring_id='%s', skipping %s ping."
            % (monitoring_id, status)
        )
        return

    try:
        response = requests.get(monitoring_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        error_text = str(e)
        # Request errors commonly quote the full URL or just its path; both
        # would reveal the token, so replace them before logging.
        secret_path = urlsplit(monitoring_url).path
        if secret_path.strip("/"):
            error_text = error_text.replace(monitoring_url, "<redacted-url>")
            error_text = error_text.replace(secret_path, "<redacted-path>")
        log_message(
            "ERROR: failed to ping healthcheck for monitoring_id='%s' and status='%s': %s"
            % (monitoring_id, status, error_text)
        )
        return

    log_message(
        "Success at pinging healthcheck with monitoring_id '%s' and status '%s'"
        % (monitoring_id, status)
    )


def run_with_healthcheck_monitoring(
*, monitoring_id: str | None, action: Callable[[], Any]
) -> Any:
if not monitoring_id:
return action()

_ping_healthcheck(monitoring_id=monitoring_id, status="start")

try:
result = action()
except Exception:
_ping_healthcheck(monitoring_id=monitoring_id, status="fail")
raise

_ping_healthcheck(monitoring_id=monitoring_id, status="success")
return result
17 changes: 17 additions & 0 deletions backend/kernelCI_app/management/commands/notifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
setup_jinja_template,
send_email_report,
)
from kernelCI_app.management.commands.helpers.healthcheck import (
MONITORING_ID_PARAM_HELP_TEXT,
run_with_healthcheck_monitoring,
)

from kernelCI_app.management.commands.helpers.summary import (
SIGNUP_FOLDER,
Expand Down Expand Up @@ -892,6 +896,12 @@ def add_arguments(self, parser):
action="store_true",
help="Ignore recipients.yaml file (optional for all actions)",
)
parser.add_argument(
"--monitoring-id",
type=str,
default=None,
help=MONITORING_ID_PARAM_HELP_TEXT,
)

# Action argument (replaces subparsers)
actions = [
Expand Down Expand Up @@ -972,6 +982,13 @@ def add_arguments(self, parser):
)

def handle(self, *args, **options):
    """Entry point: run ``_run_action`` through the healthcheck wrapper.

    ``--monitoring-id`` is read from the parsed options (``None`` when absent)
    and passed along with a zero-argument callable that runs the command body;
    the wrapper's return value is returned as-is.
    """
    requested_monitoring_id = options.get("monitoring_id")

    def run_command_body():
        return self._run_action(options)

    return run_with_healthcheck_monitoring(
        monitoring_id=requested_monitoring_id,
        action=run_command_body,
    )

def _run_action(self, options):
# Setup connections
service = smtp_setup_connection()

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from django.test import SimpleTestCase, override_settings
from unittest.mock import Mock, patch

from kernelCI_app.management.commands.helpers.healthcheck import (
_resolve_monitoring_url,
run_with_healthcheck_monitoring,
)

TEST_BASE_URL = "https://example.com"


@override_settings(
    HEALTHCHECK_MONITORING_PATH_MAP={
        "job-1": "private-token",
        "job-2": "something/with/slashes",
    },
    HEALTHCHECK_BASE_URL=TEST_BASE_URL,
)
class TestRunWithHealthcheckMonitoring(SimpleTestCase):
    """Tests for URL resolution and the start/success/fail ping lifecycle."""

    @staticmethod
    def _pinged_urls(mock_get):
        """Return the URLs passed to the mocked ``requests.get``, in call order."""
        return [ping.args[0] for ping in mock_get.call_args_list]

    def test_resolve_monitoring_url_success(self):
        result = _resolve_monitoring_url(monitoring_id="job-1", status="start")
        self.assertEqual(result, f"{TEST_BASE_URL}/private-token/start")

    def test_resolve_monitoring_url_success_status_no_suffix(self):
        result = _resolve_monitoring_url(monitoring_id="job-1", status="success")
        self.assertEqual(result, f"{TEST_BASE_URL}/private-token")

    def test_resolve_monitoring_url_keeps_inner_slashes(self):
        result = _resolve_monitoring_url(monitoring_id="job-2", status="fail")
        self.assertEqual(result, f"{TEST_BASE_URL}/something/with/slashes/fail")

    def test_resolve_monitoring_url_unknown_id_returns_none(self):
        result = _resolve_monitoring_url(monitoring_id="missing-id", status="start")
        self.assertIsNone(result)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_success_path_pings_start_and_success(self, mock_get):
        response = Mock()
        response.raise_for_status.return_value = None
        mock_get.return_value = response

        result = run_with_healthcheck_monitoring(
            monitoring_id="job-1", action=lambda: "ok"
        )

        self.assertEqual(result, "ok")
        # Order matters: the start ping must precede the success ping.
        self.assertEqual(
            self._pinged_urls(mock_get),
            [f"{TEST_BASE_URL}/private-token/start", f"{TEST_BASE_URL}/private-token"],
        )
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/start", timeout=10)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token", timeout=10)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_failure_path_pings_start_and_fail(self, mock_get):
        response = Mock()
        response.raise_for_status.return_value = None
        mock_get.return_value = response
        failing_action = Mock(side_effect=RuntimeError("boom"))

        with self.assertRaisesRegex(RuntimeError, "boom"):
            run_with_healthcheck_monitoring(
                monitoring_id="job-1",
                action=failing_action,
            )

        failing_action.assert_called_once_with()
        # Order matters: the start ping must precede the fail ping.
        self.assertEqual(
            self._pinged_urls(mock_get),
            [
                f"{TEST_BASE_URL}/private-token/start",
                f"{TEST_BASE_URL}/private-token/fail",
            ],
        )
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/start", timeout=10)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/fail", timeout=10)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_no_monitoring_id_skips_pings(self, mock_get):
        result = run_with_healthcheck_monitoring(monitoring_id=None, action=lambda: 42)

        self.assertEqual(result, 42)
        mock_get.assert_not_called()

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_unknown_monitoring_id_skips_network_and_runs_action(self, mock_get):
        result = run_with_healthcheck_monitoring(
            monitoring_id="missing-id", action=lambda: "ran"
        )

        self.assertEqual(result, "ran")
        mock_get.assert_not_called()
17 changes: 17 additions & 0 deletions docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,23 @@ The monitoring system supports multi-worker Gunicorn deployments using Prometheu
- `PROMETHEUS_METRICS_PORT`: Port for the metrics aggregator (default: `8001`)
- `PROMETHEUS_MULTIPROC_DIR`: Directory for multiprocess metric files (default: `/tmp/prometheus_multiproc_dir`)

### Cronjob Healthchecks

The backend can ping Healthchecks.io for cronjobs that run Django management commands.

- The public base URL is defined in code as `HEALTHCHECK_BASE_URL`.
- Private monitor tokens stay in environment variables and are mapped in Django settings.
- Each monitored cron run pings `/start` when it begins, then either the bare check URL (no suffix) when the command succeeds or `/fail` when it raises an exception.

Configure these variables in `.env.backend`:

- `HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS`
- `HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY`
- `HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY`
- `HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES`
- `HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT`
- `HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO`

## `prometheus.yml`
- **Target**: `host.docker.internal:8001` (backend running locally)
- **Metrics Path**: `/metrics/`
Expand Down
Loading