diff --git a/.gitignore b/.gitignore index e4c214b..8f41f5b 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,10 @@ proxy-*/ .vscode/ .idea/ +# Python (audit-hotspots + tests) +__pycache__/ +*.pyc + # OS .DS_Store Thumbs.db diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 77ceb26..132da7f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,7 @@ repos: entry: shellcheck -x language: system files: '^(bin/dotsec|bin/dotsec-build|lib/.*\.sh|tests/integration-smoke\.sh|tests/stubs/.*|\.github/scripts/.*\.sh|exegol/my-resources/bin/[^/]+|exegol/my-resources/deploy\.sh)$' + exclude: '^exegol/my-resources/bin/audit-hotspots$' - id: bats name: bats entry: bats tests/ diff --git a/Makefile b/Makefile index 199eb88..cefd083 100644 --- a/Makefile +++ b/Makefile @@ -131,7 +131,7 @@ test: ## Run bats tests @bats tests/ lint: ## shellcheck all bash (uses .shellcheckrc) - @shellcheck -x bin/dotsec bin/dotsec-build lib/*.sh tests/integration-smoke.sh exegol/my-resources/bin/* exegol/my-resources/deploy.sh && echo "[+] shellcheck clean" + @shellcheck -x bin/dotsec bin/dotsec-build lib/*.sh tests/integration-smoke.sh $$(ls exegol/my-resources/bin/* | grep -v '/audit-hotspots$$') exegol/my-resources/deploy.sh && echo "[+] shellcheck clean" smoke: ## Docker integration smoke (requires docker + make build) @bash tests/integration-smoke.sh diff --git a/README.md b/README.md index d70ffad..a2d729b 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ via `make exegol-setup` (also run by `make install`). 
The bundle includes: - **recon** scripts: `recon-subs`, `recon-alive`, `recon-fingerprint`, `recon-portscan`, `recon-screenshot`, `recon-crawl`, `recon-urls`, `recon-loot`, `recon-extract`, `recon-sourcemaps`, `recon-full`, `dl` - **scan** scripts: `scan-nuclei` (vuln scan), `scan-takeover` (dangling CNAME; subzy → nuclei fallback) -- **audit** scripts: `audit-code` (trufflehog + gitleaks + semgrep + osv-scanner over the `code/` zone) +- **audit** scripts: `audit-code` (secrets/SAST/SCA), `audit-sinks` (dangerous functions), `audit-endpoints` (routes + JS surface), `audit-hotspots` (ranked candidates), `audit-full` - Shell aliases and preloaded history - `load_user_setup.sh`: idempotent installer for the tools the scripts need that the base image lacks (xnLinkFinder, waymore, sourcemapper, osv-scanner, …) @@ -167,6 +167,7 @@ recon-full # discovery → portscan → screenshots → crawl → loot → scan-nuclei # vulnerability scan of the alive hosts (routed through the proxy) scan-takeover # subdomain takeover check audit-code # white-box audit of recovered source / sourcemaps +audit-full # full white-box pass: secrets + SCA + sinks + endpoints + ranked hotspots ``` On first container start, Exegol auto-runs `/opt/my-resources/setup/load_user_setup.sh`. diff --git a/docs/plans/2026-06-12-code-audit-suite-plan.md b/docs/plans/2026-06-12-code-audit-suite-plan.md new file mode 100644 index 0000000..56e9721 --- /dev/null +++ b/docs/plans/2026-06-12-code-audit-suite-plan.md @@ -0,0 +1,708 @@ +# Code Audit Suite Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a composable white-box code-audit suite (`audit-sinks`, `audit-endpoints`, `audit-hotspots`, `audit-full`) over the engagement `code/` zone, emitting AI-readable structured artifacts plus a ranked hotspots report. 
+ +**Architecture:** Bash orchestrator scripts run inside Exegol, env-driven (`$WORKSPACE`), writing `*.json` + summaries to `$WORKSPACE/scans/code/`. semgrep is the confident primary engine; ast-grep / weggli / xnLinkFinder are guarded best-effort enhancers. `audit-hotspots` is a pure-python aggregator (host-unit-testable). Bundled semgrep/ast-grep rules ship in `exegol/my-resources/audit-rules/`. + +**Tech Stack:** bash (shellcheck-clean), python3 (stdlib only), semgrep, ast-grep, weggli, xnLinkFinder, bats + stubs. + +--- + +## Conventions (read once) + +- New bash scripts go in `exegol/my-resources/bin/`, mode `0755`, `#!/usr/bin/env bash`, `set -euo pipefail`, the standard DOMAIN/dir guards. They must be shellcheck-clean (`shellcheck -x`), now enforced by pre-commit (`exegol/my-resources/bin/[^/]+`). +- `audit-hotspots` is python3 and MUST be excluded from shellcheck. +- Tools live in the Exegol container, not the host. Real-detection tests run in the container (`docker exec exegol-e-voting ...`); host bats tests use stubs in `tests/stubs/`. +- Output dir is always `$WORKSPACE/scans/code/` (created by `dotsec new`, PR #16). +- Commit messages: English, Conventional Commits, no `Co-Authored-By`. Stage explicit paths (`git commit -m "..." -- path`). +- The running container for live checks is `exegol-e-voting`. 
+ +## File Structure + +| File | Responsibility | +|------|----------------| +| `exegol/my-resources/bin/audit-sinks` (new) | semgrep + ast-grep + weggli sink scan → `sinks.json`/`sinks.txt` | +| `exegol/my-resources/bin/audit-endpoints` (new) | semgrep route rules + xnLinkFinder + grep → `endpoints.json`/`endpoints.txt` | +| `exegol/my-resources/bin/audit-hotspots` (new, python) | aggregate+rank all `scans/code/*.json` → `hotspots.json`/`hotspots.md` | +| `exegol/my-resources/bin/audit-full` (new) | orchestrate audit-code → sinks → endpoints → hotspots | +| `exegol/my-resources/audit-rules/endpoints.yml` (new) | semgrep custom route-definition rules | +| `exegol/my-resources/audit-rules/sgconfig-sinks.yml` + `astgrep-sinks/*.yml` (new) | ast-grep polyglot sink rules | +| `exegol/my-resources/fragments/load_user_setup.dotsec.sh` (modify) | install ast-grep + weggli | +| `exegol/my-resources/deploy.sh` (modify) | copy `audit-rules/` into the volume | +| `exegol/my-resources/fragments/aliases.dotsec` + `history.dotsec` (modify) | shortcuts + history for audit-* | +| `.pre-commit-config.yaml` + `Makefile` (modify) | exclude `audit-hotspots` (python) from shellcheck | +| `tests/audit.bats` (new) | stub-based wiring tests for the bash scripts | +| `tests/test_audit_hotspots.py` (new) | host unit tests for the python ranker | +| `README.md` (modify) | document the audit stage | + +--- + +## Task 1: Install the audit engines + +**Files:** +- Modify: `exegol/my-resources/fragments/load_user_setup.dotsec.sh` + +- [ ] **Step 1: Add ast-grep + weggli installs** + +Insert after the existing `_dotsec_release osv-scanner ...` block, before the final `echo`: + +```bash +# ── code audit: structural search engines (audit-sinks/audit-endpoints) ── +# ast-grep installs the `ast-grep` binary (it also ships `sg`, but /usr/bin/sg is +# the unrelated setgid tool — scripts always call `ast-grep`). 
+_dotsec_have ast-grep || npm install -g @ast-grep/cli >/dev/null 2>&1 || true +# weggli: C/C++ semantic grep (slow cargo build, best-effort). +_dotsec_have weggli || cargo install weggli --locked >/dev/null 2>&1 || true +``` + +- [ ] **Step 2: shellcheck the fragment** + +Run: `shellcheck -x exegol/my-resources/fragments/load_user_setup.dotsec.sh` +Expected: no output (clean). + +- [ ] **Step 3: Deploy + install live, verify binaries + pin CLIs** + +```bash +DOTSEC_HOME="$PWD" bash exegol/my-resources/deploy.sh +docker exec exegol-e-voting bash -lc 'npm install -g @ast-grep/cli >/dev/null 2>&1; command -v ast-grep; ast-grep --version; ast-grep scan --help 2>&1 | grep -iE "json|config|--rule|-c," | head' +``` +Expected: `ast-grep` resolves, version prints. **Record the exact flags** for `scan` (config flag is `-c`/`--config`, JSON flag is `--json` or `--json=stream`) — they are used verbatim in Tasks 3-4. weggli build may take minutes; it is optional, do not block on it. + +- [ ] **Step 4: Commit** + +```bash +git commit -m "feat(audit): install ast-grep + weggli engines" -- exegol/my-resources/fragments/load_user_setup.dotsec.sh +``` + +--- + +## Task 2: semgrep endpoint rules + +**Files:** +- Create: `exegol/my-resources/audit-rules/endpoints.yml` + +- [ ] **Step 1: Write the rules file** + +```yaml +# dotsec audit: backend route/handler definitions across common frameworks. +# Used by audit-endpoints (semgrep --config). INFO severity: these are surface, +# not vulns. +rules: + - id: express-route + languages: [javascript, typescript] + severity: INFO + message: "express/koa route" + patterns: + - pattern-either: + - pattern: $APP.get("$P", ...) + - pattern: $APP.post("$P", ...) + - pattern: $APP.put("$P", ...) + - pattern: $APP.delete("$P", ...) + - pattern: $APP.patch("$P", ...) + - pattern: $APP.all("$P", ...) 
+ - id: flask-fastapi-route + languages: [python] + severity: INFO + message: "flask/fastapi route" + patterns: + - pattern-either: + - pattern: "@$APP.route(\"$P\", ...)" + - pattern: "@$APP.get(\"$P\", ...)" + - pattern: "@$APP.post(\"$P\", ...)" + - pattern: "@$ROUTER.get(\"$P\", ...)" + - pattern: "@$ROUTER.post(\"$P\", ...)" + - id: spring-mapping + languages: [java] + severity: INFO + message: "spring request mapping" + patterns: + - pattern-either: + - pattern: "@RequestMapping(...)" + - pattern: "@GetMapping(...)" + - pattern: "@PostMapping(...)" + - id: go-http-handler + languages: [go] + severity: INFO + message: "go http handler" + patterns: + - pattern-either: + - pattern: $MUX.HandleFunc("$P", ...) + - pattern: http.HandleFunc("$P", ...) +``` + +- [ ] **Step 2: Validate the rules with semgrep** + +Run: `docker exec exegol-e-voting bash -lc 'semgrep --validate --config /opt/my-resources/audit-rules/endpoints.yml 2>&1 | tail -3'` +(Deploy first via `DOTSEC_HOME="$PWD" bash exegol/my-resources/deploy.sh` — but note deploy.sh does not yet copy audit-rules; copy manually for this check: `mkdir -p ~/.exegol/my-resources/audit-rules && cp exegol/my-resources/audit-rules/endpoints.yml ~/.exegol/my-resources/audit-rules/`) +Expected: `Configuration is valid` (or no rule-parse errors). Fix any pattern that fails to parse. + +- [ ] **Step 3: Commit** + +```bash +git add exegol/my-resources/audit-rules/endpoints.yml +git commit -m "feat(audit): semgrep route-definition rules" -- exegol/my-resources/audit-rules/endpoints.yml +``` + +--- + +## Task 3: `audit-hotspots` ranker (python, TDD on host) + +This is pure logic — write it test-first on the host (no container needed). 
+ +**Files:** +- Create: `tests/test_audit_hotspots.py` +- Create: `exegol/my-resources/bin/audit-hotspots` + +- [ ] **Step 1: Write the failing test** + +`tests/test_audit_hotspots.py`: + +```python +import json, os, subprocess, sys +from pathlib import Path + +SCRIPT = Path(__file__).resolve().parent.parent / "exegol/my-resources/bin/audit-hotspots" + +def _run(ws: Path): + env = {**os.environ, "WORKSPACE": str(ws)} + return subprocess.run([sys.executable, str(SCRIPT)], env=env, capture_output=True, text=True) + +def test_ranks_error_above_info(tmp_path): + outd = tmp_path / "scans" / "code"; outd.mkdir(parents=True) + (outd / "sinks.json").write_text(json.dumps({"results": [ + {"check_id": "eval-injection", "path": "a.js", "start": {"line": 5}, + "extra": {"severity": "ERROR", "message": "eval sink"}}, + {"check_id": "weak-rng", "path": "b.js", "start": {"line": 9}, + "extra": {"severity": "INFO", "message": "weak rng"}}, + ]})) + r = _run(tmp_path) + assert r.returncode == 0, r.stderr + ranked = json.loads((outd / "hotspots.json").read_text()) + assert [f["rule"] for f in ranked] == ["eval-injection", "weak-rng"] + assert (outd / "hotspots.md").exists() + +def test_verified_secret_outranks_sink(tmp_path): + outd = tmp_path / "scans" / "code"; outd.mkdir(parents=True) + (outd / "sinks.json").write_text(json.dumps({"results": [ + {"check_id": "eval-injection", "path": "a.js", "start": {"line": 5}, + "extra": {"severity": "ERROR", "message": "eval"}}]})) + (outd / "secrets_trufflehog.json").write_text( + json.dumps({"SourceMetadata": {"Data": {"Filesystem": {"file": "c.env"}}}, + "DetectorName": "AWS", "Verified": True}) + "\n") + r = _run(tmp_path) + assert r.returncode == 0, r.stderr + ranked = json.loads((outd / "hotspots.json").read_text()) + assert ranked[0]["category"] == "secret-verified" + +def test_no_inputs_is_clean(tmp_path): + (tmp_path / "scans" / "code").mkdir(parents=True) + r = _run(tmp_path) + assert r.returncode == 0 + assert json.loads((tmp_path 
/ "scans/code/hotspots.json").read_text()) == [] +``` + +- [ ] **Step 2: Run it to verify it fails** + +Run: `python3 -m pytest tests/test_audit_hotspots.py -q` (or `pytest`) +Expected: FAIL (script does not exist / no such file). + +- [ ] **Step 3: Write `exegol/my-resources/bin/audit-hotspots`** + +```python +#!/usr/bin/env python3 +"""Aggregate the audit JSON artifacts in scans/code/ into a ranked hotspots +report (hotspots.json + hotspots.md). Pure stdlib; reads whatever is present.""" +from __future__ import annotations + +import json +import os +from pathlib import Path + +SEVERITY_SCORE = {"ERROR": 4, "WARNING": 3, "HIGH": 4, "MEDIUM": 3, "LOW": 2, "INFO": 1} + + +def load_json(path: Path) -> object | None: + """Return parsed JSON, or None if missing/unparseable.""" + try: + return json.loads(path.read_text()) + except Exception: + return None + + +def load_jsonl(path: Path) -> list[dict]: + """Return a list of objects from a JSON-lines file (trufflehog format).""" + out: list[dict] = [] + try: + for line in path.read_text().splitlines(): + line = line.strip() + if line: + out.append(json.loads(line)) + except Exception: + pass + return out + + +def from_semgrep(data: object, category: str) -> list[dict]: + """Flatten a semgrep JSON report into scored findings.""" + findings: list[dict] = [] + results = data.get("results", []) if isinstance(data, dict) else [] + for r in results: + extra = r.get("extra", {}) + sev = str(extra.get("severity", "INFO")).upper() + findings.append({ + "category": category, + "rule": r.get("check_id", "?"), + "file": r.get("path", "?"), + "line": r.get("start", {}).get("line", 0), + "message": str(extra.get("message", ""))[:200], + "score": SEVERITY_SCORE.get(sev, 1), + }) + return findings + + +def from_trufflehog(records: list[dict]) -> list[dict]: + """Verified secrets rank highest; unverified are still surfaced.""" + findings: list[dict] = [] + for rec in records: + verified = bool(rec.get("Verified")) + meta = 
rec.get("SourceMetadata", {}).get("Data", {}).get("Filesystem", {}) + findings.append({ + "category": "secret-verified" if verified else "secret", + "rule": rec.get("DetectorName", "secret"), + "file": meta.get("file", "?"), + "line": meta.get("line", 0), + "message": "verified live credential" if verified else "potential secret", + "score": 5 if verified else 2, + }) + return findings + + +def main() -> int: + outd = Path(os.environ.get("WORKSPACE", ".")) / "scans" / "code" + findings: list[dict] = [] + findings += from_semgrep(load_json(outd / "sinks.json"), "sink") + findings += from_semgrep(load_json(outd / "endpoints.json"), "endpoint") + findings += from_trufflehog(load_jsonl(outd / "secrets_trufflehog.json")) + + findings.sort(key=lambda f: (f["score"], f["category"]), reverse=True) + + outd.mkdir(parents=True, exist_ok=True) + (outd / "hotspots.json").write_text(json.dumps(findings, indent=2)) + + lines = ["# Code audit hotspots", "", f"{len(findings)} candidates (ranked).", ""] + lines += ["| score | category | rule | file:line | note |", + "|------:|----------|------|-----------|------|"] + for f in findings: + lines.append(f"| {f['score']} | {f['category']} | `{f['rule']}` | " + f"`{f['file']}:{f['line']}` | {f['message']} |") + (outd / "hotspots.md").write_text("\n".join(lines) + "\n") + print(f"[+] {len(findings)} hotspots -> {outd}/hotspots.json (+ hotspots.md)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) +``` + +- [ ] **Step 4: Make it executable and run the tests** + +```bash +chmod +x exegol/my-resources/bin/audit-hotspots +python3 -m pytest tests/test_audit_hotspots.py -q +``` +Expected: 3 passed. 
+ +- [ ] **Step 5: Exclude it from shellcheck (it is python)** + +In `.pre-commit-config.yaml`, the shellcheck hook gains an `exclude`: + +```yaml + - id: shellcheck + name: shellcheck + entry: shellcheck -x + language: system + files: '^(bin/dotsec|bin/dotsec-build|lib/.*\.sh|tests/integration-smoke\.sh|tests/stubs/.*|\.github/scripts/.*\.sh|exegol/my-resources/bin/[^/]+|exegol/my-resources/deploy\.sh)$' + exclude: '^exegol/my-resources/bin/audit-hotspots$' +``` + +In `Makefile`, the `lint` target excludes it from the glob. Replace the `exegol/my-resources/bin/*` argument with an explicit filter: + +```make + @shellcheck -x bin/dotsec bin/dotsec-build lib/*.sh tests/integration-smoke.sh $$(ls exegol/my-resources/bin/* | grep -v '/audit-hotspots$$') exegol/my-resources/deploy.sh && echo "[+] shellcheck clean" +``` + +- [ ] **Step 6: Verify lint still clean** + +Run: `make lint` +Expected: `[+] shellcheck clean` (audit-hotspots not shellcheck'd). + +- [ ] **Step 7: Commit** + +```bash +git add tests/test_audit_hotspots.py exegol/my-resources/bin/audit-hotspots .pre-commit-config.yaml Makefile +git commit -m "feat(audit): hotspots ranker (python) + lint exclude" -- tests/test_audit_hotspots.py exegol/my-resources/bin/audit-hotspots .pre-commit-config.yaml Makefile +``` + +--- + +## Task 4: `audit-sinks` script + +**Files:** +- Create: `exegol/my-resources/bin/audit-sinks` +- Create: `tests/stubs/semgrep` +- Modify: `tests/audit.bats` (created here) + +- [ ] **Step 1: Write the failing bats test (wiring, stubbed semgrep)** + +Create `tests/stubs/semgrep` (mode 0755): + +```bash +#!/usr/bin/env bash +# stub: emit a fixed semgrep JSON report to the -o path +out="" +while [[ $# -gt 0 ]]; do case "$1" in -o) out="$2"; shift 2;; *) shift;; esac; done +cat > "$out" <<'JSON' +{"results":[{"check_id":"js-eval","path":"a.js","start":{"line":3},"extra":{"severity":"ERROR","message":"eval sink"}}]} +JSON +``` + +Create `tests/audit.bats`: + +```bash +#!/usr/bin/env bats +setup() 
{ + BIN="${BATS_TEST_DIRNAME}/../exegol/my-resources/bin" + export PATH="${BATS_TEST_DIRNAME}/stubs:${PATH}" + WS="$(mktemp -d)"; export WORKSPACE="$WS" + mkdir -p "$WS/code"; echo 'eval(x)' > "$WS/code/a.js" +} +teardown() { rm -rf "$WS"; } + +@test "audit-sinks writes sinks.json from semgrep" { + run "$BIN/audit-sinks" + [ "$status" -eq 0 ] + [ -f "$WS/scans/code/sinks.json" ] + grep -q "js-eval" "$WS/scans/code/sinks.json" +} + +@test "audit-sinks errors on empty code dir" { + rm -rf "$WS/code"; mkdir -p "$WS/code" + run "$BIN/audit-sinks" + [ "$status" -ne 0 ] +} +``` + +- [ ] **Step 2: Run to verify it fails** + +Run: `bats tests/audit.bats` +Expected: FAIL (audit-sinks missing). + +- [ ] **Step 3: Write `exegol/my-resources/bin/audit-sinks`** + +```bash +#!/usr/bin/env bash +set -euo pipefail +WS="${WORKSPACE:-$PWD}" +CODE="${1:-$WS/code}" +OUTD="$WS/scans/code" +RULES="${MYRES:-/opt/my-resources}/audit-rules" +mkdir -p "$OUTD" + +# Dangerous-function / sink scan over the recovered code. +[[ -d "$CODE" ]] || { echo "[!] no dir: $CODE"; exit 1; } +[[ -n "$(find "$CODE" -type f -print -quit 2>/dev/null)" ]] \ + || { echo "[!] $CODE is empty — run recon-sourcemaps / recon-loot first"; exit 1; } +echo "[*] sink scan $CODE" +: > "$OUTD/sinks.txt" + +# 1. semgrep security rulesets (primary, polyglot, JSON). +if command -v semgrep >/dev/null 2>&1; then + semgrep scan --config p/security-audit --config p/owasp-top-ten \ + --json -o "$OUTD/sinks.json" "$CODE" >/dev/null 2>&1 || true + echo "semgrep: p/security-audit + p/owasp-top-ten -> sinks.json" >> "$OUTD/sinks.txt" +fi + +# 2. ast-grep custom polyglot patterns (best-effort). Binary is `ast-grep`, +# NEVER `sg` (/usr/bin/sg is the unrelated setgid tool). 
+if command -v ast-grep >/dev/null 2>&1 && [[ -f "$RULES/sgconfig-sinks.yml" ]]; then + ast-grep scan -c "$RULES/sgconfig-sinks.yml" --json "$CODE" \ + > "$OUTD/sinks_astgrep.json" 2>/dev/null || true + echo "ast-grep: $RULES/sgconfig-sinks.yml -> sinks_astgrep.json" >> "$OUTD/sinks.txt" +fi + +# 3. weggli C/C++ semantic patterns (best-effort, C/C++ only). +if command -v weggli >/dev/null 2>&1; then + for pat in 'strcpy(_,_);' 'memcpy(_,_,_);' 'system(_);' 'sprintf(_,_);' 'gets(_);'; do + weggli "{ $pat }" "$CODE" 2>/dev/null >> "$OUTD/sinks.txt" || true + done +fi +echo "[+] sinks -> $OUTD/sinks.json (+ sinks.txt)" +``` + +- [ ] **Step 4: Make executable, run bats** + +```bash +chmod +x exegol/my-resources/bin/audit-sinks +bats tests/audit.bats +``` +Expected: the two audit-sinks tests pass. + +- [ ] **Step 5: Verify ast-grep invocation live, fix flags if needed** + +Using the flags recorded in Task 1 Step 3, confirm `ast-grep scan -c <config> --json <dir>` is valid. If the JSON flag differs (e.g. `--json=stream`), update the script. (ast-grep rules themselves are added in Task 6; this step only confirms the CLI shape so the guard is correct.) 
+ +- [ ] **Step 6: shellcheck + commit** + +```bash +shellcheck -x exegol/my-resources/bin/audit-sinks tests/stubs/semgrep +git add exegol/my-resources/bin/audit-sinks tests/stubs/semgrep tests/audit.bats +git commit -m "feat(audit): audit-sinks (semgrep + ast-grep + weggli)" -- exegol/my-resources/bin/audit-sinks tests/stubs/semgrep tests/audit.bats +``` + +--- + +## Task 5: `audit-endpoints` script + +**Files:** +- Create: `exegol/my-resources/bin/audit-endpoints` +- Modify: `tests/audit.bats` + +- [ ] **Step 1: Add failing bats test** + +Append to `tests/audit.bats`: + +```bash +@test "audit-endpoints writes endpoints.json from semgrep" { + cp "${BATS_TEST_DIRNAME}/../exegol/my-resources/audit-rules/endpoints.yml" /dev/null 2>/dev/null || true + export MYRES="${BATS_TEST_DIRNAME}/../exegol/my-resources" + run "$BIN/audit-endpoints" + [ "$status" -eq 0 ] + [ -f "$WS/scans/code/endpoints.json" ] +} +``` + +The semgrep stub already writes a fixed report to `-o`; that satisfies "writes endpoints.json". (The stub ignores `--config`, so the rules path only needs to exist.) + +- [ ] **Step 2: Run to verify it fails** + +Run: `bats tests/audit.bats` +Expected: the new endpoints test FAILS (script missing). + +- [ ] **Step 3: Write `exegol/my-resources/bin/audit-endpoints`** + +```bash +#!/usr/bin/env bash +set -euo pipefail +WS="${WORKSPACE:-$PWD}" +CODE="${1:-$WS/code}" +OUTD="$WS/scans/code" +RULES="${MYRES:-/opt/my-resources}/audit-rules" +mkdir -p "$OUTD" + +# Attack-surface extraction: backend routes + frontend API calls. +[[ -d "$CODE" ]] || { echo "[!] no dir: $CODE"; exit 1; } +[[ -n "$(find "$CODE" -type f -print -quit 2>/dev/null)" ]] \ + || { echo "[!] $CODE is empty"; exit 1; } +echo "[*] endpoint scan $CODE" + +# 1. Backend routes via semgrep custom rules. 
+if command -v semgrep >/dev/null 2>&1 && [[ -f "$RULES/endpoints.yml" ]]; then + semgrep scan --config "$RULES/endpoints.yml" \ + --json -o "$OUTD/endpoints.json" "$CODE" >/dev/null 2>&1 || true +fi + +# 2. Frontend: endpoints referenced from JS + raw URL/fetch/axios literals. +: > "$OUTD/endpoints.txt" +if command -v xnLinkFinder >/dev/null 2>&1; then + xnLinkFinder -i "$CODE" -sf "${DOMAIN:-}" -o "$OUTD/_xn.txt" >/dev/null 2>&1 || true + [[ -f "$OUTD/_xn.txt" ]] && cat "$OUTD/_xn.txt" >> "$OUTD/endpoints.txt" && rm -f "$OUTD/_xn.txt" +fi +grep -rhoEi "https?://[^\"'\` )]+|fetch\(|axios\.[a-z]+\(|XMLHttpRequest" "$CODE" 2>/dev/null \ + | sort -u >> "$OUTD/endpoints.txt" || true +sort -u "$OUTD/endpoints.txt" -o "$OUTD/endpoints.txt" 2>/dev/null || true +echo "[+] endpoints -> $OUTD/endpoints.json (+ endpoints.txt)" +``` + +- [ ] **Step 4: Run bats** + +Run: `bats tests/audit.bats` +Expected: all tests pass. + +- [ ] **Step 5: Verify xnLinkFinder dir-input flags live** + +Run: `docker exec exegol-e-voting bash -lc 'xnLinkFinder --help 2>&1 | grep -iE "\-i |\-o |\-sf|directory" | head'` +Confirm `-i <dir>` accepts a directory and `-o <file>` is the output flag. If xnLinkFinder needs a file list instead of a dir, adjust to iterate `find "$CODE" -name '*.js'`. (xnLinkFinder is guarded by `command -v`, so this only refines behavior, never breaks the script.) 
+ +- [ ] **Step 6: shellcheck + commit** + +```bash +shellcheck -x exegol/my-resources/bin/audit-endpoints +git add exegol/my-resources/bin/audit-endpoints tests/audit.bats +git commit -m "feat(audit): audit-endpoints (routes + JS surface)" -- exegol/my-resources/bin/audit-endpoints tests/audit.bats +``` + +--- + +## Task 6: ast-grep sink rules + `audit-full` + wiring + +**Files:** +- Create: `exegol/my-resources/audit-rules/sgconfig-sinks.yml` +- Create: `exegol/my-resources/audit-rules/astgrep-sinks/dangerous.yml` +- Create: `exegol/my-resources/bin/audit-full` +- Modify: `exegol/my-resources/deploy.sh` +- Modify: `exegol/my-resources/fragments/aliases.dotsec`, `history.dotsec` + +- [ ] **Step 1: ast-grep sink rules** + +`exegol/my-resources/audit-rules/sgconfig-sinks.yml`: + +```yaml +ruleDirs: + - astgrep-sinks +``` + +`exegol/my-resources/audit-rules/astgrep-sinks/dangerous.yml`: + +```yaml +id: js-eval +language: javascript +severity: warning +message: dynamic eval (code-injection sink) +rule: + pattern: eval($A) +--- +id: js-innerhtml +language: javascript +severity: warning +message: innerHTML assignment (DOM XSS sink) +rule: + pattern: $X.innerHTML = $A +--- +id: py-os-system +language: python +severity: warning +message: os.system (command-injection sink) +rule: + pattern: os.system($A) +``` + +- [ ] **Step 2: Validate ast-grep rules live** + +Run (deploy first): `docker exec exegol-e-voting bash -lc 'ast-grep scan -c /opt/my-resources/audit-rules/sgconfig-sinks.yml --json /opt/my-resources/audit-rules 2>&1 | head -c 200'` +Expected: valid JSON (possibly `[]`). Fix rule syntax to match the version's schema if it errors (record working schema). + +- [ ] **Step 3: Write `exegol/my-resources/bin/audit-full`** + +```bash +#!/usr/bin/env bash +set -euo pipefail +WS="${WORKSPACE:-$PWD}" +CODE="${1:-$WS/code}" + +# Full white-box audit pass over the code/ zone. 
+echo "[*] full code audit for $CODE" +audit-code "$CODE" || echo "[i] audit-code step skipped" +audit-sinks "$CODE" || echo "[i] sinks step skipped" +audit-endpoints "$CODE" || echo "[i] endpoints step skipped" +audit-hotspots || echo "[i] hotspots step skipped" +echo "[+] done -> ${WS}/scans/code (see hotspots.md)" +``` + +- [ ] **Step 4: deploy.sh copies audit-rules/** + +In `exegol/my-resources/deploy.sh`, after the bin-copy loop, add: + +```bash +if [[ -d "${SRC}/audit-rules" ]]; then + mkdir -p "${DEST}/audit-rules" + cp -r "${SRC}/audit-rules/." "${DEST}/audit-rules/" +fi +``` + +- [ ] **Step 5: aliases + history** + +In `exegol/my-resources/fragments/aliases.dotsec`, under the phase shortcuts, append: + +```bash +alias sinks='audit-sinks' +alias endpoints='audit-endpoints' +alias hotspots='audit-hotspots' +``` + +In `exegol/my-resources/fragments/history.dotsec`, append: + +```bash +: 0:0;audit-full code/ +: 0:0;audit-sinks code/ && audit-hotspots +: 0:0;semgrep scan --config p/security-audit --json code/ +``` + +- [ ] **Step 6: shellcheck + bats + commit** + +```bash +shellcheck -x exegol/my-resources/bin/audit-full exegol/my-resources/deploy.sh +bats tests/audit.bats +git add exegol/my-resources/audit-rules exegol/my-resources/bin/audit-full exegol/my-resources/deploy.sh exegol/my-resources/fragments/aliases.dotsec exegol/my-resources/fragments/history.dotsec +git commit -m "feat(audit): ast-grep sink rules, audit-full orchestrator + wiring" -- exegol/my-resources/audit-rules exegol/my-resources/bin/audit-full exegol/my-resources/deploy.sh exegol/my-resources/fragments/aliases.dotsec exegol/my-resources/fragments/history.dotsec +``` + +--- + +## Task 7: Live smoke + docs + +**Files:** +- Modify: `README.md` + +- [ ] **Step 1: Deploy + real-detection smoke in the container** + +```bash +DOTSEC_HOME="$PWD" bash exegol/my-resources/deploy.sh +docker exec exegol-e-voting bash -lc ' + export PATH="/opt/my-resources/bin:$PATH" WORKSPACE=/tmp/audit-smoke + rm 
-rf "$WORKSPACE"; mkdir -p "$WORKSPACE/code" + printf "const x=req.query.q; eval(x);\napp.get(\"/admin\", h);\n" > "$WORKSPACE/code/app.js" + audit-full /tmp/audit-smoke/code + echo "--- hotspots.md ---"; cat "$WORKSPACE/scans/code/hotspots.md" + rm -rf "$WORKSPACE"' +``` +Expected: `sinks.json`, `endpoints.json`, `hotspots.json`, `hotspots.md` produced; the `eval` appears as a sink and `/admin` as an endpoint in `hotspots.md`. If a tool is missing it degrades, but semgrep findings + hotspots must be present. + +- [ ] **Step 2: Document the audit stage in README** + +In `README.md`, under the Exegol provisioning "Typical flow", add after `audit-code`: + +```markdown +audit-full # full white-box pass: secrets + SCA + sinks + endpoints + ranked hotspots +``` + +And in the bundle "audit scripts" bullet, replace the line with: + +```markdown +- **audit** scripts: `audit-code` (secrets/SAST/SCA), `audit-sinks` (dangerous functions), `audit-endpoints` (routes + JS surface), `audit-hotspots` (ranked candidates), `audit-full` +``` + +- [ ] **Step 3: Final full verification** + +```bash +make lint +make test +python3 -m pytest tests/test_audit_hotspots.py -q +``` +Expected: shellcheck clean; all bats pass; pytest 3 passed. + +- [ ] **Step 4: Commit** + +```bash +git add README.md +git commit -m "docs(audit): document the white-box audit stage" -- README.md +``` + +--- + +## Self-Review (completed by author) + +- **Spec coverage:** audit-sinks (Task 4 + ast-grep rules Task 6), audit-endpoints (Task 5 + rules Task 2), audit-code (unchanged, referenced in audit-full), audit-hotspots (Task 3), audit-full (Task 6), install (Task 1), bundled rules + deploy (Task 2/6), aliases/history (Task 6), lint exclude (Task 3), tests (Task 3/4/5), honest limits (best-effort guards throughout). All spec sections mapped. 
+- **Placeholders:** none — every code step shows complete content; the three "verify CLI live" steps (ast-grep/xnLinkFinder/weggli) give the exact verification command and the fallback, because those binaries are not yet installed to hardcode against. They are guarded by `command -v`, so the scripts are correct regardless. +- **Type consistency:** `from_semgrep`/`from_trufflehog`/`load_json`/`load_jsonl` names match between definition and call; finding dict keys (`category`,`rule`,`file`,`line`,`message`,`score`) are identical in the python, the markdown writer, and the tests; output filenames (`sinks.json`,`endpoints.json`,`secrets_trufflehog.json`,`hotspots.json`,`hotspots.md`) are consistent across scripts, tests, and the ranker. diff --git a/docs/specs/2026-06-12-code-audit-suite.md b/docs/specs/2026-06-12-code-audit-suite.md new file mode 100644 index 0000000..9337e28 --- /dev/null +++ b/docs/specs/2026-06-12-code-audit-suite.md @@ -0,0 +1,110 @@ +# Code audit suite (`audit-*`) — design + +Date: 2026-06-12 +Status: design approved (brainstorm), pending implementation plan. + +## Goal + +Automated, CLI/scriptable white-box source-code audit helpers over the engagement +`code/` zone (recovered sources, leaks, `.git` dumps, repos, white-box assignments +such as the e-voting case). Two consumers: + +1. **Operator**: a ranked hotspots report. +2. **AI**: every tool emits structured, machine-readable logs an AI can read to + accelerate the deep analysis afterward. + +Polyglot and language-adaptive: it works on whatever source lands in `code/`, +front-end or back-end, no language assumption. + +## Non-goals + +- **No magic business-logic bug detection.** Logic flaws require human/AI reasoning. + The suite only surfaces the *structure* (entry points, sinks, authz candidates) + to reason over. It accelerates the analysis; it does not find the logic bug. +- **No AI-specific packaging** (no repomix/code2prompt). 
AI-readability is a + cross-cutting property: structured `*.json` + a `*.txt`/`*.md` summary per tool. + +## Components + +Composable bash scripts in `exegol/my-resources/bin/`, run **inside Exegol**, +env-driven (`$WORKSPACE`), writing to `$WORKSPACE/scans/code/`. Each emits +`.json` (machine) + a `.txt`/`.md` summary (human). All best-effort: +a missing engine degrades and is logged, never aborts. + +### 1. `audit-sinks` — dangerous functions / sinks +- **semgrep**: `--config p/security-audit --config p/owasp-top-ten` (taint rules + included), `--json`. The primary, polyglot sink engine. +- **ast-grep**: a bundled polyglot ruleset of high-signal dangerous patterns + (`eval`/`exec`/deserialize/`innerHTML`/`dangerouslySetInnerHTML`/SQL concat/ + `system`/command exec, ...). **Invoke as `ast-grep`** — `/usr/bin/sg` is the + unrelated setgid tool, never call `sg`. +- **weggli** (optional, C/C++): semantic patterns (`strcpy`/`memcpy`/`system`/ + `sprintf`/format strings). Text output (weggli has no JSON); folded into the txt. +- Output: `sinks.json` (semgrep + ast-grep merged) + `sinks.txt`. + +### 2. `audit-endpoints` — attack surface +- **Backend routes**: ast-grep ruleset per framework (express, flask/fastapi, + spring, rails, go `http.HandleFunc`, php routers). +- **Frontend**: `xnLinkFinder` on `.js` + URL / `fetch` / `axios` / XHR literals. +- Output: `endpoints.json` (`[{kind, method, path, file, line}]`) + `endpoints.txt`. + +### 3. `audit-code` (existing, kept) — secrets + SAST baseline + SCA +trufflehog + gitleaks + semgrep (`auto`) + osv-scanner. Already JSON. Unchanged. + +### 4. `audit-hotspots` — aggregator / ranker +- `python3` (stdlib only, type-hinted, docstring'd). Reads every + `scans/code/*.json`, ranks candidates by signal (semgrep severity, verified + secrets, endpoint-without-nearby-auth heuristic, sink class), emits + `hotspots.json` (sorted) + `hotspots.md` (grouped table; operator entry point + and AI summary in one). 
+- Excluded from shellcheck (it is python, not bash) via pre-commit `exclude`. + +### 5. `audit-full` — orchestrator +Runs 1 → 4 best-effort (trivial glue). + +## Data flow + +``` +code/ ──► audit-sinks ─┐ + ──► audit-endpoints ─┤──► scans/code/*.json ──► audit-hotspots ──► scans/code/hotspots.{json,md} + ──► audit-code ─┘ +``` + +## Install (`load_user_setup`, idempotent best-effort) + +- **ast-grep**: `npm install -g @ast-grep/cli` (binary `ast-grep`). +- **weggli**: `cargo install weggli --locked` (C/C++; slow, best-effort). +- semgrep / trufflehog / gitleaks / osv-scanner / xnLinkFinder / python3 / jq are + already provisioned (base image or earlier installer work). + +## Bundled assets + +`exegol/my-resources/audit-rules/` — ast-grep `sgconfig.yml` + rule files +(`sinks/`, `endpoints/`). `deploy.sh` copies the directory to +`~/.exegol/my-resources/audit-rules/` (→ `/opt/my-resources/audit-rules/`). +Scripts reference `${MYRES:-/opt/my-resources}/audit-rules`. + +## Integration + +- `deploy.sh`: the `audit-*` bin glob (PR #16) already covers the new scripts; + add a copy step for `audit-rules/`. +- `aliases.dotsec` / `history.dotsec`: add `audit-sinks`, `audit-endpoints`, + `audit-hotspots`, `audit-full`. +- Engagement tree: `scans/code/` already exists (PR #16). +- Lint: `.pre-commit-config.yaml` + `make lint` exclude `audit-hotspots` from + shellcheck (python). + +## Testing + +- **bats**: a fixture dir with a known sink (`eval(userInput)`) and a known route + (`app.get('/x', ...)`); assert `audit-sinks` flags the `eval` and + `audit-endpoints` lists `/x`. `bash -n` + shellcheck on all bash scripts. +- **audit-hotspots**: python smoke over sample JSON inputs (deterministic ranking). + +## Honest limits + +- ast-grep / semgrep custom rules are starter sets, not exhaustive; false negatives + expected. The covered rule scope is logged, not silently truncated. +- weggli is C/C++ only; skipped elsewhere. 
+- "authz / IDOR candidate" detection is heuristic (endpoint with no nearby auth + guard; object access by id with no owner check). These are leads, not findings. diff --git a/exegol/my-resources/audit-rules/astgrep-sinks/dangerous.yml b/exegol/my-resources/audit-rules/astgrep-sinks/dangerous.yml new file mode 100644 index 0000000..3146366 --- /dev/null +++ b/exegol/my-resources/audit-rules/astgrep-sinks/dangerous.yml @@ -0,0 +1,20 @@ +id: js-eval +language: javascript +severity: warning +message: dynamic eval (code-injection sink) +rule: + pattern: eval($A) +--- +id: js-innerhtml +language: javascript +severity: warning +message: innerHTML assignment (DOM XSS sink) +rule: + pattern: $X.innerHTML = $A +--- +id: py-os-system +language: python +severity: warning +message: os.system (command-injection sink) +rule: + pattern: os.system($A) diff --git a/exegol/my-resources/audit-rules/endpoints.yml b/exegol/my-resources/audit-rules/endpoints.yml new file mode 100644 index 0000000..8b0ab99 --- /dev/null +++ b/exegol/my-resources/audit-rules/endpoints.yml @@ -0,0 +1,44 @@ +# dotsec audit: backend route/handler definitions across common frameworks. +# Used by audit-endpoints (semgrep --config). INFO severity: these are surface, +# not vulns. +rules: + - id: express-route + languages: [javascript, typescript] + severity: INFO + message: "express/koa route" + patterns: + - pattern-either: + - pattern: $APP.get("$P", ...) + - pattern: $APP.post("$P", ...) + - pattern: $APP.put("$P", ...) + - pattern: $APP.delete("$P", ...) + - pattern: $APP.patch("$P", ...) + - pattern: $APP.all("$P", ...) 
+ - id: flask-fastapi-route + languages: [python] + severity: INFO + message: "flask/fastapi route" + patterns: + - pattern-either: + - pattern: "@$APP.route(\"$P\", ...)" + - pattern: "@$APP.get(\"$P\", ...)" + - pattern: "@$APP.post(\"$P\", ...)" + - pattern: "@$ROUTER.get(\"$P\", ...)" + - pattern: "@$ROUTER.post(\"$P\", ...)" + - id: spring-mapping + languages: [java] + severity: INFO + message: "spring request mapping" + patterns: + - pattern-either: + - pattern: "@RequestMapping(...)" + - pattern: "@GetMapping(...)" + - pattern: "@PostMapping(...)" + - id: go-http-handler + languages: [go] + severity: INFO + message: "go http handler" + patterns: + - pattern-either: + - pattern: $MUX.HandleFunc("$P", ...) + - pattern: http.HandleFunc("$P", ...) diff --git a/exegol/my-resources/audit-rules/sgconfig-sinks.yml b/exegol/my-resources/audit-rules/sgconfig-sinks.yml new file mode 100644 index 0000000..65aca69 --- /dev/null +++ b/exegol/my-resources/audit-rules/sgconfig-sinks.yml @@ -0,0 +1,2 @@ +ruleDirs: + - astgrep-sinks diff --git a/exegol/my-resources/bin/audit-endpoints b/exegol/my-resources/bin/audit-endpoints new file mode 100755 index 0000000..dc93e0f --- /dev/null +++ b/exegol/my-resources/bin/audit-endpoints @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail +WS="${WORKSPACE:-$PWD}" +CODE="${1:-$WS/code}" +OUTD="$WS/scans/code" +RULES="${MYRES:-/opt/my-resources}/audit-rules" +mkdir -p "$OUTD" + +# Attack-surface extraction: backend routes + frontend API calls. +[[ -d "$CODE" ]] || { echo "[!] no dir: $CODE"; exit 1; } +[[ -n "$(find "$CODE" -type f -print -quit 2>/dev/null)" ]] \ + || { echo "[!] $CODE is empty"; exit 1; } +echo "[*] endpoint scan $CODE" + +# 1. Backend routes via semgrep custom rules. +if command -v semgrep >/dev/null 2>&1 && [[ -f "$RULES/endpoints.yml" ]]; then + semgrep scan --config "$RULES/endpoints.yml" \ + --json -o "$OUTD/endpoints.json" "$CODE" >/dev/null 2>&1 || true +fi + +# 2. 
Frontend: endpoints referenced from JS + raw URL/fetch/axios literals. +: > "$OUTD/endpoints.txt" +if command -v xnLinkFinder >/dev/null 2>&1; then + xnLinkFinder -i "$CODE" -sf "${DOMAIN:-}" -o "$OUTD/_xn.txt" >/dev/null 2>&1 || true + [[ -f "$OUTD/_xn.txt" ]] && cat "$OUTD/_xn.txt" >> "$OUTD/endpoints.txt" && rm -f "$OUTD/_xn.txt" +fi +grep -rhoEi "https?://[^\"'\` )]+|fetch\(|axios\.[a-z]+\(|XMLHttpRequest" "$CODE" 2>/dev/null \ + | sort -u >> "$OUTD/endpoints.txt" || true +sort -u "$OUTD/endpoints.txt" -o "$OUTD/endpoints.txt" 2>/dev/null || true +if [[ -f "$OUTD/endpoints.json" ]]; then + echo "[+] endpoints -> $OUTD/endpoints.json (+ endpoints.txt)" +else + echo "[+] endpoints -> $OUTD/endpoints.txt (semgrep absent)" +fi diff --git a/exegol/my-resources/bin/audit-full b/exegol/my-resources/bin/audit-full new file mode 100755 index 0000000..7b403ad --- /dev/null +++ b/exegol/my-resources/bin/audit-full @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail +WS="${WORKSPACE:-$PWD}" +CODE="${1:-$WS/code}" + +# Full white-box audit pass over the code/ zone. +echo "[*] full code audit for $CODE" +audit-code "$CODE" || echo "[i] audit-code step skipped" +audit-sinks "$CODE" || echo "[i] sinks step skipped" +audit-endpoints "$CODE" || echo "[i] endpoints step skipped" +audit-hotspots || echo "[i] hotspots step skipped" +echo "[+] done -> ${WS}/scans/code (see hotspots.md)" diff --git a/exegol/my-resources/bin/audit-hotspots b/exegol/my-resources/bin/audit-hotspots new file mode 100755 index 0000000..bbacb1a --- /dev/null +++ b/exegol/my-resources/bin/audit-hotspots @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +"""Aggregate the audit JSON artifacts in scans/code/ into a ranked hotspots +report (hotspots.json + hotspots.md). 
# Pure stdlib; reads whatever is present.
from __future__ import annotations

import json
import os
from pathlib import Path

# semgrep severities (upper-cased before lookup) -> ranking score.
SEVERITY_SCORE = {"ERROR": 4, "WARNING": 3, "HIGH": 4, "MEDIUM": 3, "LOW": 2, "INFO": 1}

# ast-grep severities (lower-cased before lookup) -> ranking score.
ASTGREP_SCORE = {"error": 4, "warning": 3, "info": 1, "hint": 1}

# Longest message kept per finding, so one noisy rule cannot bloat the report.
_MESSAGE_LIMIT = 200


def _as_dict(value: object) -> dict:
    """Return *value* when it is a dict, else an empty dict (tolerates JSON null)."""
    return value if isinstance(value, dict) else {}


def _as_int(value: object, default: int = 0) -> int:
    """Coerce a JSON scalar to int, falling back to *default* when impossible."""
    if isinstance(value, bool):
        return default
    try:
        return int(value)  # type: ignore[call-overload]
    except (TypeError, ValueError, OverflowError):
        return default


def _text(value: object, default: str = "") -> str:
    """Stringify *value*, mapping a missing/null value to *default*."""
    return default if value is None else str(value)


def _md_cell(value: object) -> str:
    """Escape a value so it cannot break a markdown table cell."""
    return (str(value).replace("|", "\\|").replace("\r", " ")
            .replace("\n", " ").replace("`", "'"))


def load_json(path: Path) -> object | None:
    """Return the parsed JSON document at *path*, or None if missing/unparseable."""
    try:
        # Tool output is UTF-8; never depend on the container's locale.
        return json.loads(path.read_text(encoding="utf-8", errors="replace"))
    except (OSError, ValueError, RecursionError):
        return None


def load_jsonl(path: Path) -> list[dict]:
    """Return the JSON objects of a JSON-lines file (trufflehog format).

    Blank lines, malformed lines and lines that are not JSON objects are
    skipped individually, so one bad line never hides the records after it.
    A missing or unreadable file yields an empty list.
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return []
    records: list[dict] = []
    for raw in text.splitlines():
        raw = raw.strip()
        if not raw:
            continue
        try:
            record = json.loads(raw)
        except (ValueError, RecursionError):
            continue
        if isinstance(record, dict):
            records.append(record)
    return records


def from_semgrep(data: object, category: str) -> list[dict]:
    """Flatten a semgrep JSON report into scored findings tagged *category*.

    Anything that is not a report object with a ``results`` list yields an
    empty list; non-object entries inside ``results`` are ignored.
    """
    results = _as_dict(data).get("results")
    findings: list[dict] = []
    for result in results if isinstance(results, list) else []:
        if not isinstance(result, dict):
            continue
        extra = _as_dict(result.get("extra"))
        severity = str(extra.get("severity", "INFO")).upper()
        findings.append({
            "category": category,
            "rule": _text(result.get("check_id"), "?"),
            "file": _text(result.get("path"), "?"),
            "line": _as_int(_as_dict(result.get("start")).get("line")),
            "message": _text(extra.get("message"))[:_MESSAGE_LIMIT],
            "score": SEVERITY_SCORE.get(severity, 1),
        })
    return findings


def from_trufflehog(records: list[dict]) -> list[dict]:
    """Turn trufflehog records into findings.

    Verified secrets rank highest (score 5); unverified ones are still
    surfaced (score 2). Non-object records are ignored.
    """
    findings: list[dict] = []
    for record in records:
        if not isinstance(record, dict):
            continue
        verified = bool(record.get("Verified"))
        source = _as_dict(record.get("SourceMetadata"))
        meta = _as_dict(_as_dict(source.get("Data")).get("Filesystem"))
        findings.append({
            "category": "secret-verified" if verified else "secret",
            "rule": _text(record.get("DetectorName"), "secret"),
            "file": _text(meta.get("file"), "?"),
            "line": _as_int(meta.get("line")),
            "message": "verified live credential" if verified else "potential secret",
            "score": 5 if verified else 2,
        })
    return findings


def from_astgrep(data: object) -> list[dict]:
    """Flatten an ast-grep --json report (array of matches) into scored sinks.

    ast-grep reports 0-indexed line numbers, so the line is shifted to
    1-indexed. Non-object matches are ignored.
    """
    findings: list[dict] = []
    for match in data if isinstance(data, list) else []:
        if not isinstance(match, dict):
            continue
        severity = str(match.get("severity", "warning")).lower()
        start = _as_dict(_as_dict(match.get("range")).get("start"))
        findings.append({
            "category": "sink",
            "rule": _text(match.get("ruleId"), "?"),
            "file": _text(match.get("file"), "?"),
            "line": _as_int(start.get("line")) + 1,
            "message": _text(match.get("message"))[:_MESSAGE_LIMIT],
            "score": ASTGREP_SCORE.get(severity, 3),
        })
    return findings


def _collect(outd: Path) -> list[dict]:
    """Load every known artifact under *outd* and return the ranked findings."""
    findings: list[dict] = [
        *from_semgrep(load_json(outd / "sinks.json"), "sink"),
        *from_semgrep(load_json(outd / "endpoints.json"), "endpoint"),
        *from_trufflehog(load_jsonl(outd / "secrets_trufflehog.json")),
        *from_astgrep(load_json(outd / "sinks_astgrep.json")),
    ]
    # Highest score first; the sort is stable, so ties keep their load order
    # within a category.
    findings.sort(key=lambda f: (f["score"], f["category"]), reverse=True)
    return findings


def _render_markdown(findings: list[dict]) -> str:
    """Render the ranked findings as the hotspots.md table."""
    lines = ["# Code audit hotspots", "", f"{len(findings)} candidates (ranked).", "",
             "| score | category | rule | file:line | note |",
             "|------:|----------|------|-----------|------|"]
    lines += [
        f"| {_md_cell(f['score'])} | {_md_cell(f['category'])} | `{_md_cell(f['rule'])}` | "
        f"`{_md_cell(f['file'])}:{_md_cell(f['line'])}` | {_md_cell(f['message'])} |"
        for f in findings
    ]
    return "\n".join(lines) + "\n"


def main() -> int:
    """Aggregate $WORKSPACE/scans/code/*.json into hotspots.json + hotspots.md.

    The workspace defaults to the current directory when WORKSPACE is unset.
    Returns 0; missing or unreadable inputs simply contribute no findings.
    """
    outd = Path(os.environ.get("WORKSPACE", ".")) / "scans" / "code"
    findings = _collect(outd)

    outd.mkdir(parents=True, exist_ok=True)
    (outd / "hotspots.json").write_text(json.dumps(findings, indent=2), encoding="utf-8")
    (outd / "hotspots.md").write_text(_render_markdown(findings), encoding="utf-8")
    print(f"[+] {len(findings)} hotspots -> {outd}/hotspots.json (+ hotspots.md)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
a/exegol/my-resources/bin/audit-sinks b/exegol/my-resources/bin/audit-sinks new file mode 100755 index 0000000..248dcba --- /dev/null +++ b/exegol/my-resources/bin/audit-sinks @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -euo pipefail +WS="${WORKSPACE:-$PWD}" +CODE="${1:-$WS/code}" +OUTD="$WS/scans/code" +RULES="${MYRES:-/opt/my-resources}/audit-rules" +mkdir -p "$OUTD" + +# Dangerous-function / sink scan over the recovered code. +[[ -d "$CODE" ]] || { echo "[!] no dir: $CODE"; exit 1; } +[[ -n "$(find "$CODE" -type f -print -quit 2>/dev/null)" ]] \ + || { echo "[!] $CODE is empty — run recon-sourcemaps / recon-loot first"; exit 1; } +echo "[*] sink scan $CODE" +: > "$OUTD/sinks.txt" + +# 1. semgrep security rulesets (primary, polyglot, JSON). +if command -v semgrep >/dev/null 2>&1; then + semgrep scan --config p/security-audit --config p/owasp-top-ten \ + --json -o "$OUTD/sinks.json" "$CODE" >/dev/null 2>&1 || true + echo "semgrep: p/security-audit + p/owasp-top-ten -> sinks.json" >> "$OUTD/sinks.txt" +fi + +# 2. ast-grep custom polyglot patterns (best-effort). Binary is `ast-grep`, +# NEVER `sg` (/usr/bin/sg is the unrelated setgid tool). +if command -v ast-grep >/dev/null 2>&1 && [[ -f "$RULES/sgconfig-sinks.yml" ]]; then + ast-grep scan -c "$RULES/sgconfig-sinks.yml" --json "$CODE" \ + > "$OUTD/sinks_astgrep.json" 2>/dev/null || true + echo "ast-grep: $RULES/sgconfig-sinks.yml -> sinks_astgrep.json" >> "$OUTD/sinks.txt" +fi + +# 3. weggli C/C++ semantic patterns (best-effort, C/C++ only). 
+if command -v weggli >/dev/null 2>&1; then + for pat in 'strcpy(_,_);' 'memcpy(_,_,_);' 'system(_);' 'sprintf(_,_);' 'gets(_);'; do + weggli "{ $pat }" "$CODE" 2>/dev/null >> "$OUTD/sinks.txt" || true + done +fi +if [[ -f "$OUTD/sinks.json" ]]; then + echo "[+] sinks -> $OUTD/sinks.json (+ sinks.txt)" +else + echo "[+] sinks -> $OUTD/sinks.txt (semgrep absent)" +fi diff --git a/exegol/my-resources/deploy.sh b/exegol/my-resources/deploy.sh index 18381ad..f92dcf1 100755 --- a/exegol/my-resources/deploy.sh +++ b/exegol/my-resources/deploy.sh @@ -30,6 +30,11 @@ done [[ -f "${SRC}/bin/dl" ]] && cp "${SRC}/bin/dl" "${DEST}/bin/" chmod +x "${DEST}/bin/"* 2>/dev/null || true +if [[ -d "${SRC}/audit-rules" ]]; then + mkdir -p "${DEST}/audit-rules" + cp -r "${SRC}/audit-rules/." "${DEST}/audit-rules/" +fi + merge_block "${DEST}/setup/zsh/aliases" "${SRC}/fragments/aliases.dotsec" merge_block "${DEST}/setup/zsh/history" "${SRC}/fragments/history.dotsec" merge_block "${DEST}/setup/load_user_setup.sh" "${SRC}/fragments/load_user_setup.dotsec.sh" diff --git a/exegol/my-resources/fragments/aliases.dotsec b/exegol/my-resources/fragments/aliases.dotsec index cb4621a..8655944 100644 --- a/exegol/my-resources/fragments/aliases.dotsec +++ b/exegol/my-resources/fragments/aliases.dotsec @@ -10,3 +10,6 @@ alias shots='recon-screenshot' alias scan='scan-nuclei' alias takeover='scan-takeover' alias audit='audit-code' +alias sinks='audit-sinks' +alias endpoints='audit-endpoints' +alias hotspots='audit-hotspots' diff --git a/exegol/my-resources/fragments/history.dotsec b/exegol/my-resources/fragments/history.dotsec index 9c4d41c..19231ea 100644 --- a/exegol/my-resources/fragments/history.dotsec +++ b/exegol/my-resources/fragments/history.dotsec @@ -15,3 +15,6 @@ : 0:0;gowitness scan file -f subdomains_alive.txt --screenshot-path recon/screenshots --write-none : 0:0;osv-scanner scan --recursive code/ : 0:0;semgrep --config auto code/ +: 0:0;audit-full code/ +: 0:0;audit-sinks code/ && 
audit-hotspots +: 0:0;semgrep scan --config p/security-audit --json code/ diff --git a/exegol/my-resources/fragments/load_user_setup.dotsec.sh b/exegol/my-resources/fragments/load_user_setup.dotsec.sh index 5478ef9..5d4b537 100644 --- a/exegol/my-resources/fragments/load_user_setup.dotsec.sh +++ b/exegol/my-resources/fragments/load_user_setup.dotsec.sh @@ -60,4 +60,11 @@ _dotsec_release osv-scanner \ https://github.com/google/osv-scanner/releases/latest/download/osv-scanner_linux_amd64 \ https://github.com/google/osv-scanner/releases/latest/download/osv-scanner_linux_arm64 +# ── code audit: structural search engines (audit-sinks/audit-endpoints) ── +# ast-grep installs the `ast-grep` binary (it also ships `sg`, but /usr/bin/sg is +# the unrelated setgid tool — scripts always call `ast-grep`). +_dotsec_have ast-grep || npm install -g @ast-grep/cli >/dev/null 2>&1 || true +# weggli: C/C++ semantic grep (slow cargo build, best-effort). +_dotsec_have weggli || cargo install weggli --locked >/dev/null 2>&1 || true + echo "[dotsec] tooling ready (go-built extras skipped if go < 1.23 — covered by fallbacks)." 
diff --git a/tests/audit.bats b/tests/audit.bats new file mode 100644 index 0000000..67801b0 --- /dev/null +++ b/tests/audit.bats @@ -0,0 +1,28 @@ +#!/usr/bin/env bats +setup() { + BIN="${BATS_TEST_DIRNAME}/../exegol/my-resources/bin" + export PATH="${BATS_TEST_DIRNAME}/stubs:${PATH}" + WS="$(mktemp -d)"; export WORKSPACE="$WS" + mkdir -p "$WS/code"; echo 'eval(x)' > "$WS/code/a.js" +} +teardown() { rm -rf "$WS"; } + +@test "audit-sinks writes sinks.json from semgrep" { + run "$BIN/audit-sinks" + [ "$status" -eq 0 ] + [ -f "$WS/scans/code/sinks.json" ] + grep -q "js-eval" "$WS/scans/code/sinks.json" +} + +@test "audit-sinks errors on empty code dir" { + rm -rf "$WS/code"; mkdir -p "$WS/code" + run "$BIN/audit-sinks" + [ "$status" -ne 0 ] +} + +@test "audit-endpoints writes endpoints.json from semgrep" { + export MYRES="${BATS_TEST_DIRNAME}/../exegol/my-resources" + run "$BIN/audit-endpoints" + [ "$status" -eq 0 ] + [ -f "$WS/scans/code/endpoints.json" ] +} diff --git a/tests/stubs/semgrep b/tests/stubs/semgrep new file mode 100755 index 0000000..21254ba --- /dev/null +++ b/tests/stubs/semgrep @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# stub: emit a fixed semgrep JSON report to the -o path +out="" +while [[ $# -gt 0 ]]; do case "$1" in -o) out="$2"; shift 2;; *) shift;; esac; done +cat > "$out" <<'JSON' +{"results":[{"check_id":"js-eval","path":"a.js","start":{"line":3},"extra":{"severity":"ERROR","message":"eval sink"}}]} +JSON diff --git a/tests/test_audit_hotspots.py b/tests/test_audit_hotspots.py new file mode 100644 index 0000000..bc2124b --- /dev/null +++ b/tests/test_audit_hotspots.py @@ -0,0 +1,67 @@ +import json, os, subprocess, sys +from pathlib import Path + +SCRIPT = Path(__file__).resolve().parent.parent / "exegol/my-resources/bin/audit-hotspots" + +def _run(ws: Path): + env = {**os.environ, "WORKSPACE": str(ws)} + return subprocess.run([sys.executable, str(SCRIPT)], env=env, capture_output=True, text=True) + +def test_ranks_error_above_info(tmp_path): + 
outd = tmp_path / "scans" / "code"; outd.mkdir(parents=True) + (outd / "sinks.json").write_text(json.dumps({"results": [ + {"check_id": "eval-injection", "path": "a.js", "start": {"line": 5}, + "extra": {"severity": "ERROR", "message": "eval sink"}}, + {"check_id": "weak-rng", "path": "b.js", "start": {"line": 9}, + "extra": {"severity": "INFO", "message": "weak rng"}}, + ]})) + r = _run(tmp_path) + assert r.returncode == 0, r.stderr + ranked = json.loads((outd / "hotspots.json").read_text()) + assert [f["rule"] for f in ranked] == ["eval-injection", "weak-rng"] + assert (outd / "hotspots.md").exists() + +def test_verified_secret_outranks_sink(tmp_path): + outd = tmp_path / "scans" / "code"; outd.mkdir(parents=True) + (outd / "sinks.json").write_text(json.dumps({"results": [ + {"check_id": "eval-injection", "path": "a.js", "start": {"line": 5}, + "extra": {"severity": "ERROR", "message": "eval"}}]})) + (outd / "secrets_trufflehog.json").write_text( + json.dumps({"SourceMetadata": {"Data": {"Filesystem": {"file": "c.env"}}}, + "DetectorName": "AWS", "Verified": True}) + "\n") + r = _run(tmp_path) + assert r.returncode == 0, r.stderr + ranked = json.loads((outd / "hotspots.json").read_text()) + assert ranked[0]["category"] == "secret-verified" + +def test_no_inputs_is_clean(tmp_path): + (tmp_path / "scans" / "code").mkdir(parents=True) + r = _run(tmp_path) + assert r.returncode == 0 + assert json.loads((tmp_path / "scans/code/hotspots.json").read_text()) == [] + +def test_astgrep_sink_included(tmp_path): + outd = tmp_path / "scans" / "code"; outd.mkdir(parents=True) + (outd / "sinks_astgrep.json").write_text(json.dumps([ + {"ruleId": "js-eval", "severity": "warning", "file": "app.js", + "range": {"start": {"line": 0, "column": 0}}, "message": "dynamic eval"} + ])) + r = _run(tmp_path) + assert r.returncode == 0, r.stderr + ranked = json.loads((outd / "hotspots.json").read_text()) + assert ranked[0]["rule"] == "js-eval" + assert ranked[0]["category"] == "sink" + 
assert ranked[0]["line"] == 1 # ast-grep 0-indexed -> 1-indexed + +def test_markdown_escapes_pipe_and_newline(tmp_path): + outd = tmp_path / "scans" / "code"; outd.mkdir(parents=True) + (outd / "sinks.json").write_text(json.dumps({"results": [ + {"check_id": "r", "path": "a.js", "start": {"line": 1}, + "extra": {"severity": "ERROR", "message": "bad | line\nbreak"}}]})) + r = _run(tmp_path) + assert r.returncode == 0, r.stderr + md = (outd / "hotspots.md").read_text() + data_rows = [ln for ln in md.splitlines() + if ln.startswith("| ") and "score |" not in ln and "---" not in ln] + assert len(data_rows) == 1 # a newline in a field must not split the row + assert "\\|" in md # the literal pipe must be escaped