diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 951ca2ea1..3045d61cc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -59,6 +59,11 @@ jobs: - '@percy/monitoring' - '@percy/cli-doctor' runs-on: ${{ matrix.os }} + # Collect failed node-test spec names so retries re-run only the specs that + # flaked instead of the whole suite. Non-PERCY_ name so it doesn't trip + # cli-doctor's env-audit tests. See PER-9011. + env: + CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json steps: - uses: actions/checkout@v5 - uses: actions/setup-node@v3 @@ -87,8 +92,48 @@ jobs: sudo apt-get update sudo apt-get install -y --fix-missing libgbm-dev if: ${{ matrix.os == 'ubuntu-latest' }} + # First attempt runs the full suite WITH coverage (enforces the 100% + # gate) and records any failed specs. - name: Run tests + continue-on-error: true + id: retry0 run: yarn workspace ${{ matrix.package }} test:coverage --colors + # Retries re-run ONLY the specs that failed in the previous attempt, and + # WITHOUT coverage (a subset can't hit the 100% threshold). If retry0 + # failed with no recorded spec failures (e.g. a real coverage drop), the + # runner preserves that failure instead of masking it. See PER-9011. + - name: Run tests Retry (1/4) + continue-on-error: true + id: retry1 + if: steps.retry0.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + - name: Run tests Retry (2/4) + continue-on-error: true + id: retry2 + if: steps.retry1.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + - name: Run tests Retry (3/4) + continue-on-error: true + id: retry3 + if: steps.retry2.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + - name: Run tests Retry (4/4) + id: retry4 + if: steps.retry3.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' + run: yarn workspace ${{ matrix.package }} test --colors + # Keep "green via retry" honest: surface a warning whenever the first + # attempt failed and a retry recovered it, so flakiness stays visible. + - name: Flag flaky tests + if: steps.retry0.outcome=='failure' + run: echo "::warning title=Flaky tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)." regression: name: Regression diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 7028a2867..8aa1ba4c2 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -58,6 +58,11 @@ jobs: - '@percy/monitoring' - '@percy/cli-doctor' runs-on: windows-latest + # Collect failed node-test spec names here so retries can re-run only the + # specs that flaked instead of the whole ~60-min suite. See PER-9011. + # Name is deliberately non-PERCY_ so it doesn't trip env-audit tests. + env: + CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json steps: - uses: actions/checkout@v5 - uses: actions/setup-node@v3 @@ -81,26 +86,42 @@ jobs: name: dist path: packages - run: yarn + # First attempt runs the full suite and records any failed specs. - name: Run tests continue-on-error: true id: retry0 run: yarn workspace ${{ matrix.package }} test --colors - - name: Run tests Retry (1/3) + # Retries re-run ONLY the specs that failed in the previous attempt + # (CLI_TEST_ONLY_FAILED), so a flaky spec costs seconds, not ~60 min. + - name: Run tests Retry (1/4) continue-on-error: true id: retry1 if: steps.retry0.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (2/4) continue-on-error: true id: retry2 if: steps.retry1.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (3/4) continue-on-error: true id: retry3 if: steps.retry2.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors - name: Run tests Retry (4/4) id: retry4 if: steps.retry3.outcome=='failure' + env: + CLI_TEST_ONLY_FAILED: '1' run: yarn workspace ${{ matrix.package }} test --colors + # Keep "green via retry" honest: surface a warning whenever the first + # attempt failed and a retry recovered it, so flakiness stays visible. + - name: Flag flaky tests + if: steps.retry0.outcome=='failure' + run: echo "::warning title=Flaky Windows tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)." diff --git a/.gitignore b/.gitignore index bb9542884..a050b3ede 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,6 @@ bstack-ai-harness.yml CLAUDE.md .claude/ # bstack-ai-harness:end + +# spec-level retry bookkeeping (CI only, see PER-9011) +.cli-test-failures.json diff --git a/scripts/test.js b/scripts/test.js index b733898b8..64cde2f8a 100644 --- a/scripts/test.js +++ b/scripts/test.js @@ -125,8 +125,63 @@ async function main({ } })); + // Spec-level retry support for flaky environments (notably Windows CI, + // where ~1 spec out of 1000+ flakes per run on browser/server timing). + // When CLI_TEST_FAILURES_FILE is set we record every failed spec name; + // a follow-up run with CLI_TEST_ONLY_FAILED=1 re-runs only those specs, + // turning a ~60-min full-suite retry into a few seconds. See PER-9011. + // NB: these env vars are intentionally NOT prefixed with PERCY_ so they do + // not leak into env-audit / Percy-env detection tests (cli-doctor). + let failuresFile = process.env.CLI_TEST_FAILURES_FILE; + let onlyFailed = process.env.CLI_TEST_ONLY_FAILED === '1'; + let priorFailures = []; + + if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) { + try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* treated as no recorded failures below */ } + } + + // A retry was requested but the previous run recorded no failed specs. That + // means the failure was not a flaky spec (e.g. a coverage-threshold failure + // or a crash) -- re-running a subset would mask it, so preserve the failure. + if (onlyFailed && !priorFailures.length) { + console.log(colors.yellow('No recorded spec failures to retry — preserving the previous failure.')); + process.exit(1); + } + + if (onlyFailed) { + let retrySet = new Set(priorFailures); + jasmine.env.configure({ specFilter: spec => retrySet.has(spec.getFullName()) }); + console.log(colors.yellow(`Re-running ${priorFailures.length} previously-failed spec(s)...\n`)); + } + + let failures = []; + jasmine.addReporter({ + specDone: result => result.status === 'failed' && failures.push(result.fullName) + }); + console.log(colors.magenta('Running node tests...\n')); - await jasmine.execute(); + + // manage the exit code ourselves so failures can be persisted first + jasmine.exitOnCompletion = false; + let result; + + try { + result = await jasmine.execute(); + } finally { + if (failuresFile) { + try { fs.writeFileSync(failuresFile, JSON.stringify(failures)); } catch { /* best-effort */ } + } + } + + // When re-running only previously-failed specs, jasmine reports the run as + // "incomplete" (the rest are intentionally skipped) — so success here means + // the targeted specs passed, i.e. no new failures. A normal full run keeps + // jasmine's strict status (incomplete/failed both count as failure). + let passed = onlyFailed + ? failures.length === 0 + : (result ? result.overallStatus === 'passed' : failures.length === 0); + + process.exit(passed ? 0 : 1); } else if (testBrowsers) { // $ karma start --config /karma.config.js let { default: Karma } = await import('karma');