percy · AkashBrowserStack · Jun 8, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 8, 2026
@@ -59,6 +59,11 @@ jobs:
           - '@percy/monitoring'
           - '@percy/cli-doctor'
     runs-on: ${{ matrix.os }}
+    # Collect failed node-test spec names so retries re-run only the specs that
+    # flaked instead of the whole suite. Non-PERCY_ name so it doesn't trip
+    # cli-doctor's env-audit tests. See PER-9011.
+    env:
+      CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json
     steps:
       - uses: actions/checkout@v5
       - uses: actions/setup-node@v3
@@ -87,8 +92,48 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y --fix-missing libgbm-dev
         if: ${{ matrix.os == 'ubuntu-latest' }}
+      # First attempt runs the full suite WITH coverage (enforces the 100%
+      # gate) and records any failed specs.
       - name: Run tests
+        continue-on-error: true
+        id: retry0
         run: yarn workspace ${{ matrix.package }} test:coverage --colors
+      # Retries re-run ONLY the specs that failed in the previous attempt, and
+      # WITHOUT coverage (a subset can't hit the 100% threshold). If retry0
+      # failed with no recorded spec failures (e.g. a real coverage drop), the
+      # runner preserves that failure instead of masking it. See PER-9011.
+      - name: Run tests Retry (1/4)
+        continue-on-error: true
+        id: retry1
+        if: steps.retry0.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      - name: Run tests Retry (2/4)
+        continue-on-error: true
+        id: retry2
+        if: steps.retry1.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      - name: Run tests Retry (3/4)
+        continue-on-error: true
+        id: retry3
+        if: steps.retry2.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      - name: Run tests Retry (4/4)
+        id: retry4
+        if: steps.retry3.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
+        run: yarn workspace ${{ matrix.package }} test --colors
+      # Keep "green via retry" honest: surface a warning whenever the first
+      # attempt failed and a retry recovered it, so flakiness stays visible.
+      - name: Flag flaky tests
+        if: steps.retry0.outcome=='failure'
+        run: echo "::warning title=Flaky tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)."
 
   regression:
     name: Regression

@@ -58,6 +58,11 @@ jobs:
           - '@percy/monitoring'
           - '@percy/cli-doctor'
     runs-on: windows-latest
+    # Collect failed node-test spec names here so retries can re-run only the
+    # specs that flaked instead of the whole ~60-min suite. See PER-9011.
+    # Name is deliberately non-PERCY_ so it doesn't trip env-audit tests.
+    env:
+      CLI_TEST_FAILURES_FILE: ${{ github.workspace }}/.cli-test-failures.json
     steps:
       - uses: actions/checkout@v5
       - uses: actions/setup-node@v3
@@ -81,26 +86,42 @@ jobs:
           name: dist
           path: packages
       - run: yarn
+      # First attempt runs the full suite and records any failed specs.
       - name: Run tests
         continue-on-error: true
         id: retry0
         run: yarn workspace ${{ matrix.package }} test --colors
-      - name: Run tests Retry (1/3)
+      # Retries re-run ONLY the specs that failed in the previous attempt
+      # (CLI_TEST_ONLY_FAILED), so a flaky spec costs seconds, not ~60 min.
+      - name: Run tests Retry (1/4)
         continue-on-error: true
         id: retry1
         if: steps.retry0.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (2/4)
         continue-on-error: true
         id: retry2
         if: steps.retry1.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (3/4)
         continue-on-error: true
         id: retry3
         if: steps.retry2.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
       - name: Run tests Retry (4/4)
         id: retry4
         if: steps.retry3.outcome=='failure'
+        env:
+          CLI_TEST_ONLY_FAILED: '1'
         run: yarn workspace ${{ matrix.package }} test --colors
+      # Keep "green via retry" honest: surface a warning whenever the first
+      # attempt failed and a retry recovered it, so flakiness stays visible.
+      - name: Flag flaky tests
+        if: steps.retry0.outcome=='failure'
+        run: echo "::warning title=Flaky Windows tests::${{ matrix.package }} flaked on the first attempt and was recovered by a spec-level retry (tracked in PER-9011)."
@@ -18,3 +18,6 @@ bstack-ai-harness.yml
 CLAUDE.md
 .claude/
 # bstack-ai-harness:end
+
+# spec-level retry bookkeeping (CI only, see PER-9011)
+.cli-test-failures.json
@@ -125,8 +125,63 @@ async function main({
       }
     }));
 
+    // Spec-level retry support for flaky environments (notably Windows CI,
+    // where ~1 spec out of 1000+ flakes per run on browser/server timing).
+    // When CLI_TEST_FAILURES_FILE is set we record every failed spec name;
+    // a follow-up run with CLI_TEST_ONLY_FAILED=1 re-runs only those specs,
+    // turning a ~60-min full-suite retry into a few seconds. See PER-9011.
+    // NB: these env vars are intentionally NOT prefixed with PERCY_ so they do
+    // not leak into env-audit / Percy-env detection tests (cli-doctor).
+    let failuresFile = process.env.CLI_TEST_FAILURES_FILE;
+    let onlyFailed = process.env.CLI_TEST_ONLY_FAILED === '1';
+    let priorFailures = [];
+
+    if (onlyFailed && failuresFile && fs.existsSync(failuresFile)) {
+      try { priorFailures = JSON.parse(fs.readFileSync(failuresFile, 'utf8')); } catch { /* treated as no recorded failures below */ }
+    }
+
+    // A retry was requested but the previous run recorded no failed specs. That
+    // means the failure was not a flaky spec (e.g. a coverage-threshold failure
+    // or a crash) -- re-running a subset would mask it, so preserve the failure.
+    if (onlyFailed && !priorFailures.length) {
+      console.log(colors.yellow('No recorded spec failures to retry — preserving the previous failure.'));
+      process.exit(1);
+    }
+
+    if (onlyFailed) {
+      let retrySet = new Set(priorFailures);
+      jasmine.env.configure({ specFilter: spec => retrySet.has(spec.getFullName()) });
+      console.log(colors.yellow(`Re-running ${priorFailures.length} previously-failed spec(s)...\n`));
+    }
+
+    let failures = [];
+    jasmine.addReporter({
+      specDone: result => result.status === 'failed' && failures.push(result.fullName)
+    });
+
     console.log(colors.magenta('Running node tests...\n'));
-    await jasmine.execute();
+
+    // manage the exit code ourselves so failures can be persisted first
+    jasmine.exitOnCompletion = false;
+    let result;
+
+    try {
+      result = await jasmine.execute();
+    } finally {
+      if (failuresFile) {
+        try { fs.writeFileSync(failuresFile, JSON.stringify(failures)); } catch { /* best-effort */ }
+      }
+    }
+
+    // When re-running only previously-failed specs, jasmine reports the run as
+    // "incomplete" (the rest are intentionally skipped) — so success here means
+    // the targeted specs passed, i.e. no new failures. A normal full run keeps
+    // jasmine's strict status (incomplete/failed both count as failure).
+    let passed = onlyFailed
+      ? failures.length === 0
+      : (result ? result.overallStatus === 'passed' : failures.length === 0);
+
+    process.exit(passed ? 0 : 1);
   } else if (testBrowsers) {
     // $ karma start --config <root>/karma.config.js
     let { default: Karma } = await import('karma');