-
Notifications
You must be signed in to change notification settings - Fork 728
feat: vulnerability scanning within git integration (IN-956) #3892
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
803e49e
28a077b
bcf68d3
39f8935
ed3308a
a6c7fd4
fda4ee8
670e779
1458167
572449e
91403d5
6c57c4a
7ec4c04
b4e9df5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -167,6 +167,7 @@ async def run_shell_command( | |
| cwd: str = None, | ||
| timeout: float | None = None, | ||
| input_text: str | bytes | None = None, | ||
| stderr_logger=None, | ||
| ) -> str: | ||
| """ | ||
| Run shell command asynchronously and return output on success, raise exception on failure. | ||
|
|
@@ -176,6 +177,7 @@ async def run_shell_command( | |
| cwd: Working directory | ||
| timeout: Command timeout in seconds | ||
| input_text: Text (str) or bytes to send to stdin (will automatically append newline if not present) | ||
| stderr_logger: If provided, stderr lines are streamed in real-time and logged via this callable | ||
|
|
||
| Returns: | ||
| str: Command stdout output | ||
|
|
@@ -219,17 +221,37 @@ async def run_shell_command( | |
| input_text += "\n" | ||
| stdin_input = input_text.encode("utf-8") | ||
|
|
||
| # Wait for completion with optional timeout | ||
| if timeout: | ||
| stdout, stderr = await asyncio.wait_for( | ||
| process.communicate(input=stdin_input), timeout=timeout | ||
| ) | ||
| if stderr_logger: | ||
| stderr_lines: list[str] = [] | ||
|
|
||
| async def _run_with_stderr_logging() -> bytes: | ||
| async def _stream() -> None: | ||
| async for raw_line in process.stderr: | ||
| line = _safe_decode(raw_line).rstrip() | ||
| if line: | ||
| stderr_logger.info(line) | ||
| stderr_lines.append(line) | ||
|
|
||
| stdout, _ = await asyncio.gather(process.stdout.read(), _stream()) | ||
| await process.wait() | ||
| return stdout | ||
|
|
||
| coro = _run_with_stderr_logging() | ||
| stdout = await (asyncio.wait_for(coro, timeout=timeout) if timeout else coro) | ||
| stdout_text = _safe_decode(stdout).strip() if stdout else "" | ||
| stderr_text = "\n".join(stderr_lines) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. **stdin_input silently dropped when stderr_logger is provided** (Medium Severity): When `stderr_logger` is provided, the streaming path never calls `process.communicate(input=stdin_input)` — it only reads `process.stdout` and `process.stderr` — so any `input_text` is never written to the child's stdin and the process may hang or misbehave waiting for input. |
||
| else: | ||
| stdout, stderr = await process.communicate(input=stdin_input) | ||
| # Wait for completion with optional timeout | ||
| if timeout: | ||
| stdout, stderr = await asyncio.wait_for( | ||
| process.communicate(input=stdin_input), timeout=timeout | ||
| ) | ||
| else: | ||
| stdout, stderr = await process.communicate(input=stdin_input) | ||
|
|
||
| # Handle potentially non-UTF-8 encoded output from git commands | ||
| stdout_text = _safe_decode(stdout).strip() if stdout else "" | ||
| stderr_text = _safe_decode(stderr).strip() if stderr else "" | ||
| # Handle potentially non-UTF-8 encoded output from git commands | ||
| stdout_text = _safe_decode(stdout).strip() if stdout else "" | ||
| stderr_text = _safe_decode(stderr).strip() if stderr else "" | ||
|
|
||
| # Check return code | ||
| if process.returncode == 0: | ||
|
|
@@ -248,8 +270,11 @@ async def run_shell_command( | |
| logger.error(f"Permission error: {stderr_text}") | ||
| raise PermissionError(f"Permission denied while running: {command_str}") | ||
| else: | ||
| logger.error(f"Command error: {stderr_text}") | ||
| raise CommandExecutionError(f"Command failed: {command_str} - {stderr_text}") | ||
| logger.error(f"Command failed (exit {process.returncode}): {stderr_text}") | ||
| raise CommandExecutionError( | ||
| f"Command failed (exit {process.returncode}): {command_str} - {stderr_text}", | ||
| returncode=process.returncode, | ||
| ) | ||
|
|
||
| except asyncio.TimeoutError: | ||
| logger.error(f"Command timed out after {timeout}s: {command_str}") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| # Binaries | ||
| *.exe | ||
| *.exe~ | ||
| *.dll | ||
| *.so | ||
| *.dylib | ||
| vulnerability-scanner | ||
|
|
||
| # Test binary | ||
| *.test | ||
|
|
||
| # Coverage | ||
| *.out | ||
|
|
||
| # Go workspace | ||
| go.work | ||
|
|
||
| # Go dependencies (use vendor/ instead) | ||
| vendor/ | ||
|
|
||
| # Environment | ||
| .env |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,109 @@ | ||
| # Vulnerability Scanner | ||
|
|
||
| Scans a cloned git repository for known vulnerabilities using the [Google OSV Scanner SDK](https://pkg.go.dev/github.com/google/osv-scanner/v2/pkg/osvscanner) and persists the results to the insights database. | ||
|
|
||
| ## How it fits into the pipeline | ||
|
|
||
| The repository worker processes repos through a fixed sequence of services per clone batch: | ||
|
|
||
| ``` | ||
| Clone → [first batch only] → SoftwareValue → VulnerabilityScanner → Maintainer → Commits → ... | ||
| ``` | ||
|
|
||
| It runs on the first batch only, when the full working directory is available. The Python worker calls the scanner via subprocess, passing the local repo path and the canonical repo URL. The Go binary does all the scanning and writes results directly to the database, then exits — the Python side only tracks whether it succeeded or failed. | ||
|
|
||
| ## Architecture: why a Go binary wrapped in Python | ||
|
|
||
| The rest of the git integration is Python, but the OSV Scanner is a Go library with no Python bindings. Rather than shelling out to the `osv-scanner` CLI (which is fragile and adds an extra process layer), we embed the scanner as a Go SDK dependency and call it programmatically. This gives us full control over scan parameters, result access, and error handling. | ||
|
|
||
| The binary follows the same **subprocess + JSON stdout** pattern used by the `software-value` service. The Python wrapper calls it, reads the JSON response from stdout, and treats it as a black box. The binary always exits with code 0 under its own control — errors are communicated through the JSON payload — so the Python subprocess machinery never misinterprets a non-zero exit as a crash. (External termination, such as an OOM kill producing returncode -9, can still yield a non-zero exit; see "OOM handling and stale scan cleanup" below.) | ||
|
|
||
| ## Key design decisions | ||
|
|
||
| ### OSV Scanner uses sentinel errors for expected outcomes | ||
|
|
||
| OSV Scanner returns sentinel errors for two non-failure cases: | ||
|
|
||
| - `ErrVulnerabilitiesFound` — the scanner found vulnerabilities. Like `grep` returning exit code 1 on a match, this is an expected outcome. We treat it as success and surface the results normally. | ||
| - `ErrNoPackagesFound` — the repo contains no scannable package manifests (e.g. a pure C or shell project). This is not a failure — the scanner ran correctly and determined there is nothing to check. The scan record gets status `no_packages_found` and the Python side does not treat it as an error. | ||
|
|
||
| Any other non-nil error from the scanner is a real failure. | ||
|
|
||
| ### Vulnerability identity: (repo_url, vulnerability_id, package_name, source_path) | ||
|
|
||
| Each vulnerability is uniquely identified by these four fields. The same CVE can appear in multiple packages and multiple lockfiles, so all four are needed to distinguish records. This is the unique key for upserts. | ||
|
|
||
| ### vulnerability_id and the ID columns | ||
|
|
||
| Every vulnerability record has a `vulnerability_id` — the primary identifier OSV assigns to the advisory (e.g. `GHSA-xxxx-xxxx-xxxx`). GHSA is OSV's preferred canonical ID: even when a CVE exists, OSV typically uses the GHSA ID as the primary and lists the CVE as an alias. This is because GitHub's Advisory Database is OSV's main upstream source and GHSA IDs are stable and consistently present. | ||
|
|
||
| Beyond the primary ID, each advisory carries a list of aliases (cross-references to the same vulnerability in other databases). We take both the primary ID and all aliases and classify them into three `TEXT[]` columns by prefix: | ||
|
|
||
| - `cve_ids` — IDs starting with `CVE-` | ||
| - `ghsa_ids` — IDs starting with `GHSA-` (usually includes the primary ID) | ||
| - `other_ids` — everything else (e.g. `PYSEC-`, `GO-`, `RUSTSEC-`, `MAL-`) | ||
|
|
||
| Each column holds an array, so multiple IDs of the same type are stored as proper discrete values. | ||
|
|
||
| ### Severity derived from CVSS score, not OSV severity strings | ||
|
|
||
| OSV records include a `MaxSeverity` field as a CVSS numeric score. We map it to our own four-tier scale (CRITICAL / HIGH / MEDIUM / LOW) using standard CVSS thresholds rather than trusting the advisory's own severity label, which is inconsistently populated across ecosystems. | ||
|
|
||
| ### Status: OPEN vs FIX_AVAILABLE vs RESOLVED | ||
|
|
||
| A vulnerability is `FIX_AVAILABLE` if the OSV record contains a fixed version in any of the affected ranges — meaning a patch exists but the repo is still on the vulnerable version. It's `OPEN` if no fix is known. `RESOLVED` is set automatically by the database logic (see below) for findings that were present in a previous scan but are no longer detected. | ||
|
|
||
| The OSV scanner's flatten operation can occasionally produce multiple entries for the same (vulnerability_id, package_name, source_path) with different package versions — for example when a package appears more than once in the same lockfile. When that happens, we keep the smallest version, since that represents the worst-case exposure for that specific lockfile. | ||
|
|
||
| ### Database strategy: upsert + mark-resolved, not delete + insert | ||
|
|
||
| On each scan, rather than deleting all previous findings and inserting fresh ones, we: | ||
|
|
||
| 1. Mark all currently active findings for the repo as `RESOLVED`. | ||
| 2. Upsert the current scan results — this re-activates any finding that is still present and inserts new ones. Findings that genuinely disappeared remain `RESOLVED` with a timestamp. | ||
|
|
||
| This preserves history. You can tell when a vulnerability was first detected, when it was last seen, and when it went away — without needing a separate audit log. It also makes it cheap to compute `new_count` (rows that were truly inserted, not updated) and `resolved_count` (rows still carrying the newly-set `resolved_at` after the upsert pass). | ||
|
|
||
| ### Transitive dependency scanning | ||
|
|
||
| By default the scanner resolves the full transitive dependency graph, not just direct dependencies declared in lockfiles. This catches vulnerabilities in indirect deps that the project never explicitly references. | ||
|
|
||
| Transitive scanning is expensive for repos with large or deep package ecosystems — projects like ML frameworks with hundreds of transitive Python dependencies can exhaust the 3-minute scan timeout. The scanner handles this automatically: | ||
|
|
||
| 1. **First scan**: attempt with transitive=true. If it times out, retry immediately with transitive=false (direct deps only). The result of whichever mode succeeded is stored. | ||
| 2. **Subsequent scans**: reuse the same mode as the previous completed scan (`transitive_deps_scanned` stored per scan record). This avoids flip-flopping between modes and keeps results comparable across scans. | ||
|
|
||
|
|
||
| ### OOM handling and stale scan cleanup | ||
|
|
||
| If the Go binary is killed by the OOM killer (SIGKILL, exit code -9), the scan record it created stays stuck as `running` since the process never got a chance to finalize it. The Python wrapper detects this: | ||
|
|
||
| 1. On **any** `CommandExecutionError`, it connects to the insights DB and marks all `running` scans for that repo as `failure` with the error message. This cleans up stale records regardless of the failure reason. | ||
| 2. On **OOM specifically** (returncode -9), it retries the scan with the `--no-transitive` flag, which forces the Go binary to skip transitive dependency resolution — the most memory-intensive part of scanning. This gives large repos a second chance to complete within memory limits. | ||
|
|
||
| The `--no-transitive` flag is parsed by the Go binary and overrides the normal transitive mode selection logic. | ||
|
|
||
| ### Scan tracking | ||
|
|
||
| Every invocation creates a row in `vulnerability_scans` before the scan starts (status: `running`) and updates it on completion with duration, counts, and any error. Terminal statuses are `success`, `no_packages_found`, and `failure`. This makes it possible to detect stalled or crashed scans and gives a simple history of scan health per repo. | ||
|
|
||
| ## Building | ||
|
|
||
| The binary is built during Docker image construction (see `Dockerfile.git_integration`). To build locally: | ||
|
|
||
| ```bash | ||
| cd services/apps/git_integration/src/crowdgit/services/vulnerability_scanner | ||
| go build -o vulnerability-scanner . | ||
| ``` | ||
|
|
||
|
|
||
| Output is a JSON object on stdout: | ||
|
|
||
| ```json | ||
| { "status": "success", "error_code": null, "error_message": null } | ||
| ``` | ||
|
|
||
| On repos with no scannable package manifests: | ||
| ```json | ||
| { "status": "no_packages_found", "error_code": null, "error_message": null } | ||
| ``` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| package main | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "strconv" | ||
| ) | ||
|
|
||
// DBConfig holds connection settings for the insights Postgres database,
// populated from INSIGHTS_DB_* environment variables.
type DBConfig struct {
	User     string
	Password string
	DBName   string
	Host     string
	Port     int
	SSLMode  string
	PoolMax  int
}

// Config is the full runtime configuration for a single scanner invocation.
type Config struct {
	TargetPath       string // local path of the cloned repository to scan
	GitURL           string // canonical repository URL used as the DB key
	NoTransitive     bool   // set elsewhere (CLI flag); zero value here
	InsightsDatabase DBConfig
}

// getConfig builds a Config from the CLI arguments and environment.
//
// It validates that targetPath exists on disk and reads the insights DB
// settings from INSIGHTS_DB_* environment variables. Numeric variables
// (INSIGHTS_DB_PORT, INSIGHTS_DB_POOL_MAX) must parse as integers when
// set; an unparsable value is reported as an error rather than silently
// falling back to 0, which would otherwise surface later as a confusing
// connection failure.
func getConfig(targetPath, gitURL string) (Config, error) {
	var config Config

	config.TargetPath = targetPath
	config.GitURL = gitURL

	if _, err := os.Stat(config.TargetPath); err != nil {
		if os.IsNotExist(err) {
			return config, fmt.Errorf("target path '%s' does not exist: %w", config.TargetPath, err)
		}
		return config, fmt.Errorf("error checking target path '%s': %w", config.TargetPath, err)
	}

	config.InsightsDatabase.User = os.Getenv("INSIGHTS_DB_USERNAME")
	config.InsightsDatabase.Password = os.Getenv("INSIGHTS_DB_PASSWORD")
	config.InsightsDatabase.DBName = os.Getenv("INSIGHTS_DB_DATABASE")
	config.InsightsDatabase.Host = os.Getenv("INSIGHTS_DB_WRITE_HOST")
	config.InsightsDatabase.SSLMode = os.Getenv("INSIGHTS_DB_SSLMODE")

	// Fail fast on malformed numeric settings instead of silently using 0.
	if portStr := os.Getenv("INSIGHTS_DB_PORT"); portStr != "" {
		port, err := strconv.Atoi(portStr)
		if err != nil {
			return config, fmt.Errorf("invalid INSIGHTS_DB_PORT %q: %w", portStr, err)
		}
		config.InsightsDatabase.Port = port
	}
	if poolMaxStr := os.Getenv("INSIGHTS_DB_POOL_MAX"); poolMaxStr != "" {
		poolMax, err := strconv.Atoi(poolMaxStr)
		if err != nil {
			return config, fmt.Errorf("invalid INSIGHTS_DB_POOL_MAX %q: %w", poolMaxStr, err)
		}
		config.InsightsDatabase.PoolMax = poolMax
	}

	return config, nil
}


Uh oh!
There was an error while loading. Please reload this page.