Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
0120960
feat: 2pc WAL record format and config
willothy Apr 22, 2026
1ff2d99
feat: 2pc WAL segment files
willothy Apr 23, 2026
d8cb1c7
feat: 2pc WAL writer task with group commit
willothy Apr 23, 2026
a023025
feat: 2pc WAL recovery and Wal::open
willothy Apr 24, 2026
9ae0914
style: rustfmt
willothy Apr 24, 2026
402c5a4
docs: note distinct-dir requirement and TODO flock on Wal::open
willothy Apr 24, 2026
e8f6572
feat: implement 2pc WAL recovery with corruption handling
willothy Apr 24, 2026
7871461
style: format
willothy Apr 24, 2026
475caa8
feat: restore decided 2pc transactions even when recovery hits corrup…
willothy Apr 24, 2026
a958c7c
feat: write 2pc transitions to the WAL
willothy Apr 24, 2026
5bff3fb
feat: enable 2pc WAL on startup when two_phase_commit is set
willothy Apr 24, 2026
48400e0
refactor: read 2pc WAL directory from config instead of plumbing it t…
willothy Apr 24, 2026
614f537
feat: 2pc WAL checkpoints with periodic GC
willothy Apr 24, 2026
e041801
refactor: rename Manager::start to Manager::enable_wal
willothy Apr 24, 2026
d552694
feat: flock the 2pc WAL directory at open
willothy Apr 24, 2026
d1c03da
fix: refuse 2pc operations when the WAL append fails
willothy Apr 26, 2026
990d9f1
fix: rewind 2pc WAL segment when a batch fails to write or sync
willothy Apr 26, 2026
438ec28
docs: clean up stale 2pc comments
willothy Apr 26, 2026
d25a467
refactor: drop 2pc WAL probe step in favor of lockfile
willothy Apr 26, 2026
63b69ea
fix: json schema
willothy Apr 26, 2026
19f880f
style: format
willothy Apr 26, 2026
04c439b
test: 2pc crash recovery integration test
willothy Apr 27, 2026
46cb21b
feat: expose two_pc_recovered_total stat in OpenMetrics
willothy Apr 27, 2026
b90dff3
test: extend 2pc crash safety with clean-shutdown and flock specs, fi…
willothy Apr 27, 2026
699d361
test: cover 2pc commit-path recovery via synthetic WAL helper
willothy Apr 27, 2026
9867edd
style: format
willothy Apr 27, 2026
2495f24
docs: clean up and update 2pc comments
willothy Apr 27, 2026
a6365fd
refactor: drop unused 2pc segment accessors and path fields
willothy Apr 27, 2026
baaca29
docs: correct 2pc wal fsync_interval=0 behavior
willothy Apr 27, 2026
bf98401
test: clean up 2pc crash safety test comments
willothy Apr 27, 2026
c7faff0
fix: 2pc checkpoint interval honors documented seconds unit
willothy Apr 27, 2026
afe8f9f
docs: 2pc wal dir uses flock, not a probe
willothy Apr 27, 2026
39de6b2
feat: make 2pc wal optional, disabled by default
levkk Apr 27, 2026
8ad7d0b
schema
levkk Apr 27, 2026
c95ad2c
try to fix plugin integration test
levkk Apr 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,14 @@ jobs:
- name: Compute cache key
id: cache-key
run: echo "key=pgdog-bin-${{ runner.os }}-$(bash integration/ci/cache-key.sh)" >> "$GITHUB_OUTPUT"
# rust-cache must run before the binary restore: it lays down a
# stale target/ that can otherwise wipe target/release/pgdog when
# cargo reconciles fingerprints during plugin builds.
- name: Restore Rust cache for plugin builds
if: matrix.needs_rust_cache
uses: useblacksmith/rust-cache@v3
with:
prefix-key: build-v1
- name: Restore pgdog binaries
uses: useblacksmith/cache/restore@v5
with:
Expand All @@ -84,11 +92,6 @@ jobs:
target/release/pgdog
key: ${{ steps.cache-key.outputs.key }}
fail-on-cache-miss: true
- name: Restore Rust cache for plugin builds
if: matrix.needs_rust_cache
uses: useblacksmith/rust-cache@v3
with:
prefix-key: build-v1
- name: Setup dependencies
run: bash integration/ci/setup.sh --with-toxi
- name: Run ${{ matrix.name }}
Expand Down
33 changes: 33 additions & 0 deletions .schema/pgdog.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@
"tls_verify": "prefer",
"two_phase_commit": false,
"two_phase_commit_auto": null,
"two_phase_commit_wal_checkpoint_interval": 60,
"two_phase_commit_wal_dir": null,
"two_phase_commit_wal_fsync_interval": 2,
"two_phase_commit_wal_segment_size": 16777216,
"unique_id_function": "standard",
"unique_id_min": 0,
"workers": 2
Expand Down Expand Up @@ -1074,6 +1078,35 @@
],
"default": null
},
"two_phase_commit_wal_checkpoint_interval": {
"description": "How often, in seconds, to write a checkpoint record to the two-phase commit WAL and garbage-collect old segments.\n\n_Default:_ `60`\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_checkpoint_interval",
"type": "integer",
"format": "uint64",
"default": 60,
"minimum": 0
},
"two_phase_commit_wal_dir": {
"description": "Directory where the two-phase commit write-ahead log is stored.\n\n**Note:** This setting cannot be changed at runtime. PgDog acquires an exclusive `flock` on `<dir>/.lock` at startup. If the directory cannot be created or written to, or another PgDog process already holds the lock, the WAL is disabled and a warning is logged: 2PC will continue to function but will not be durable across restarts.\n\n_Default:_ `./pgdog_wal`\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_dir",
"type": [
"string",
"null"
],
"default": null
},
"two_phase_commit_wal_fsync_interval": {
"description": "How long, in milliseconds, the two-phase commit WAL writer waits to coalesce concurrent appends into a single fsync.\n\n**Note:** Setting this to `0` disables waiting for additional appends; records already queued in the channel when the writer wakes are still batched into one fsync. Higher values trade per-transaction commit latency for fewer fsyncs under load.\n\n_Default:_ `2`\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_fsync_interval",
"type": "integer",
"format": "uint64",
"default": 2,
"minimum": 0
},
"two_phase_commit_wal_segment_size": {
"description": "Maximum size, in bytes, of a single two-phase commit WAL segment file before it is rotated.\n\n_Default:_ `16777216` (16 MiB)\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_segment_size",
"type": "integer",
"format": "uint64",
"default": 16777216,
"minimum": 0
},
"unique_id_function": {
"description": "Unique ID generation function.",
"$ref": "#/$defs/UniqueIdFunction",
Expand Down
19 changes: 19 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ exclude = ["fuzz"]
members = [
"examples/demo",
"integration/rust",
"integration/two_pc_crash_safety/wal_helper",
"pgdog",
"pgdog-config",
"pgdog-macros",
Expand Down
6 changes: 6 additions & 0 deletions integration/two_pc_crash_safety/Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# frozen_string_literal: true

# Gems for the 2pc crash-safety integration specs.
# Exact versions are pinned by the adjacent Gemfile.lock.
source 'https://rubygems.org'
gem 'pg'                # Postgres client: talks to pgdog and directly to the shards
gem 'rspec', '~> 3.4'   # spec runner for crash_recovery_spec.rb
gem 'toxiproxy'         # fault injection: holds shard_1 traffic so pgdog hangs mid-2PC
30 changes: 30 additions & 0 deletions integration/two_pc_crash_safety/Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.6.2)
pg (1.5.9)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)
toxiproxy (2.0.2)

PLATFORMS
ruby

DEPENDENCIES
pg
rspec (~> 3.4)
toxiproxy

BUNDLED WITH
1.17.2
172 changes: 172 additions & 0 deletions integration/two_pc_crash_safety/crash_recovery_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# frozen_string_literal: true

require_relative 'rspec_helper'

CONFIG_DIR = __dir__

# Crash-safety specs for PgDog's two-phase-commit WAL. Each example
# spawns a real pgdog process against two Postgres shards (shard_1 sits
# behind a toxiproxy listener so traffic can be held in flight), kills
# or stops it, and asserts what WAL recovery does on the next start.
# Helpers (spawn_pgdog, wait_for_pgdog, prepared_xacts, metric, SHARDS,
# synthesize_phase2_wal, ...) come from rspec_helper.
describe '2pc crash safety' do
  let(:wal_dir) { @wal_dir }

  before(:each) do
    cleanup_prepared_xacts!
    @pid, @wal_dir = spawn_pgdog(CONFIG_DIR)
    wait_for_pgdog
  end

  after(:each) do
    stop_pgdog(@pid)
    FileUtils.rm_rf(@wal_dir) if @wal_dir
  end

  it 'rolls back orphan prepared xacts after a kill mid-PREPARE' do
    client = pgdog_conn
    client.exec('BEGIN')
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (0, 'a')")
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (1, 'b')")

    # Hold shard_1's responses so PREPARE TRANSACTION on shard_1 hangs
    # waiting for the reply. Latency keeps the connection open so pgdog
    # stays mid-2PC.
    Toxiproxy[:crash_safety_shard_1].toxic(:latency, latency: 30_000).apply do
      Thread.new do
        begin
          client.exec('COMMIT')
        rescue PG::Error
          # connection drops when we kill pgdog.
        end
      end
      # Wait for pgdog to issue PREPARE on shard_0 and start blocking
      # on shard_1's response before we kill it.
      sleep 1.0
      Process.kill('KILL', @pid)
      Process.waitpid(@pid)
      @pid = nil
    end

    # At least one shard must have an orphan prepared xact left behind,
    # otherwise recovery has nothing to clean up.
    leftover = SHARDS.flat_map { |s| prepared_xacts(s) }
    expect(leftover).not_to be_empty,
      'no prepared xacts after kill; the kill landed before any PREPARE made it through'

    # Restart pgdog with the same WAL directory. Recovery sees the
    # Begin record and drives ROLLBACK PREPARED on every participant.
    @pid, _ = spawn_pgdog(CONFIG_DIR, wal_dir: @wal_dir)
    wait_for_pgdog

    expect(metric('two_pc_recovered_total')).to be > 0,
      'restarted pgdog reports zero recovered txns; recovery did not run'

    expect(wait_for_no_prepared_xacts).to be(true),
      lambda {
        leftover = SHARDS.flat_map { |s| prepared_xacts(s) }
        "prepared xacts did not drain: #{leftover.inspect}"
      }

    # Rolled back on every shard: no rows may survive.
    SHARDS.each do |shard|
      c = shard_conn(shard)
      count = c.exec('SELECT COUNT(*) FROM crash_safety_test').to_a[0]['count'].to_i
      c.close
      expect(count).to eq(0), "expected no rows on #{shard[:db]}, found #{count}"
    end
  end

  it 'commits orphan prepared xacts after a kill mid-COMMIT-PREPARED' do
    # Synthesize the on-disk state pgdog would leave behind if it
    # crashed mid-Phase2, then start pgdog and verify it commits the
    # orphans. Catching pgdog reliably between "Committing record
    # durable" and "all COMMIT PREPAREDs done" from outside isn't
    # practical: toxiproxy is byte-level so any delay on shard_1 also
    # blocks PREPARE, and Postgres has no hook in COMMIT PREPARED.
    stop_pgdog(@pid)
    @pid = nil
    FileUtils.rm_rf(@wal_dir)
    Dir.mkdir(@wal_dir)

    gid = "__pgdog_2pc_#{rand(1..(1 << 62))}"
    synthesize_phase2_wal(@wal_dir, gid, 'pgdog', 'pgdog')

    # PgDog suffixes the gid with `_<shard_idx>` per shard so the
    # names don't collide on a single cluster (pgdog/src/backend/pool/
    # connection/binding.rs). Mirror that scheme here.
    SHARDS.each_with_index do |shard, idx|
      c = shard_conn(shard)
      c.exec('BEGIN')
      c.exec("INSERT INTO crash_safety_test (id, value) VALUES (#{idx}, 'r')")
      c.exec("PREPARE TRANSACTION '#{gid}_#{idx}'")
      c.close
    end

    @pid, _ = spawn_pgdog(CONFIG_DIR, wal_dir: @wal_dir)
    wait_for_pgdog

    expect(metric('two_pc_recovered_total')).to be > 0,
      'restarted pgdog reports zero recovered txns; recovery did not run'

    expect(wait_for_no_prepared_xacts).to be(true),
      lambda {
        leftover = SHARDS.flat_map { |s| prepared_xacts(s) }
        "prepared xacts did not drain: #{leftover.inspect}"
      }

    # Committed on every shard: exactly the one synthesized row survives.
    SHARDS.each do |shard|
      c = shard_conn(shard)
      count = c.exec('SELECT COUNT(*) FROM crash_safety_test').to_a[0]['count'].to_i
      c.close
      expect(count).to eq(1), "expected 1 row on #{shard[:db]}, found #{count}"
    end
  end

  it 'no recovery work after a clean shutdown' do
    client = pgdog_conn
    client.exec('BEGIN')
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (0, 'a')")
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (1, 'b')")
    client.exec('COMMIT')
    client.close

    # SIGTERM lets Manager::shutdown drain and wal.append_end every
    # finished txn. The WAL on disk should describe a complete cycle
    # (Begin, Committing, End) for every txn so recovery has nothing
    # to do on the next start.
    stop_pgdog(@pid, signal: 'TERM')
    @pid = nil

    @pid, _ = spawn_pgdog(CONFIG_DIR, wal_dir: @wal_dir)
    wait_for_pgdog

    expect(metric('two_pc_recovered_total')).to eq(0),
      'clean-shutdown WAL should have left no in-flight txns to recover'
  end

  it 'second pgdog refuses to share the WAL directory' do
    # First pgdog (started in before) holds the flock on @wal_dir/.lock.
    # Spawning a second one against the same dir should fail to acquire
    # the lock; enable_wal warns and continues without WAL durability,
    # so we verify the failure via the log line.
    log_path = File.join(CONFIG_DIR, 'pgdog-second.log')
    second_pid = Process.spawn(
      {
        'PGDOG_TWO_PHASE_COMMIT_WAL_DIR' => @wal_dir,
        'PGDOG_PORT' => '6433',
      },
      ENV['PGDOG_BIN'] || File.expand_path('../../target/release/pgdog', __dir__),
      '--config', File.join(CONFIG_DIR, 'pgdog.toml'),
      '--users', File.join(CONFIG_DIR, 'users.toml'),
      out: log_path, err: %i[child out]
    )

    begin
      # Poll the log for the lock-failure line instead of a fixed
      # sleep: a fixed 2s was racy when startup took longer on a slow
      # machine, and wasted time on a fast one. Bounded so a broken
      # build can't hang the suite.
      deadline = Time.now + 10
      loop do
        break if File.exist?(log_path) &&
                 File.read(log_path).include?('locked by another process')
        break if Time.now >= deadline
        sleep 0.1
      end
      stop_pgdog(second_pid)

      log = File.read(log_path)
      expect(log).to include('locked by another process'),
        "expected DirLocked error in second pgdog log; got:\n#{log}"
    ensure
      FileUtils.rm_f(log_path)
    end
  end
end
40 changes: 40 additions & 0 deletions integration/two_pc_crash_safety/pgdog.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# pgdog config for the 2pc crash-safety integration specs. The spec
# harness overrides the WAL directory and listen port per-process via
# PGDOG_* environment variables (see crash_recovery_spec.rb) — presumably
# spawn_pgdog does the same; verify in rspec_helper.
[general]
two_phase_commit = true
# Scraped by the specs for the two_pc_recovered_total counter.
openmetrics_port = 9090

# shard_0 is reached directly on the default Postgres port.
[[databases]]
name = "pgdog"
host = "127.0.0.1"
shard = 0
database_name = "shard_0"

# shard_1 is reached via the toxiproxy listener so the spec can hold
# requests in flight (latency toxic) and SIGKILL pgdog mid-2PC.
[[databases]]
name = "pgdog"
host = "127.0.0.1"
port = 5435
shard = 1
database_name = "shard_1"

[[sharded_tables]]
database = "pgdog"
column = "id"
data_type = "bigint"

# id = 0 routes to shard 0; id = 1 routes to shard 1, so a two-row
# insert in one transaction always spans both shards and triggers 2PC.
[[sharded_mappings]]
database = "pgdog"
column = "id"
kind = "list"
values = [0]
shard = 0

[[sharded_mappings]]
database = "pgdog"
column = "id"
kind = "list"
values = [1]
shard = 1

[admin]
password = "pgdog"
Loading
Loading