Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
0120960
feat: 2pc WAL record format and config
willothy Apr 22, 2026
1ff2d99
feat: 2pc WAL segment files
willothy Apr 23, 2026
d8cb1c7
feat: 2pc WAL writer task with group commit
willothy Apr 23, 2026
a023025
feat: 2pc WAL recovery and Wal::open
willothy Apr 24, 2026
9ae0914
style: rustfmt
willothy Apr 24, 2026
402c5a4
docs: note distinct-dir requirement and TODO flock on Wal::open
willothy Apr 24, 2026
e8f6572
feat: implement 2pc WAL recovery with corruption handling
willothy Apr 24, 2026
7871461
style: format
willothy Apr 24, 2026
475caa8
feat: restore decided 2pc transactions even when recovery hits corrup…
willothy Apr 24, 2026
a958c7c
feat: write 2pc transitions to the WAL
willothy Apr 24, 2026
5bff3fb
feat: enable 2pc WAL on startup when two_phase_commit is set
willothy Apr 24, 2026
48400e0
refactor: read 2pc WAL directory from config instead of plumbing it t…
willothy Apr 24, 2026
614f537
feat: 2pc WAL checkpoints with periodic GC
willothy Apr 24, 2026
e041801
refactor: rename Manager::start to Manager::enable_wal
willothy Apr 24, 2026
d552694
feat: flock the 2pc WAL directory at open
willothy Apr 24, 2026
d1c03da
fix: refuse 2pc operations when the WAL append fails
willothy Apr 26, 2026
990d9f1
fix: rewind 2pc WAL segment when a batch fails to write or sync
willothy Apr 26, 2026
438ec28
docs: clean up stale 2pc comments
willothy Apr 26, 2026
d25a467
refactor: drop 2pc WAL probe step in favor of lockfile
willothy Apr 26, 2026
63b69ea
fix: json schema
willothy Apr 26, 2026
19f880f
style: format
willothy Apr 26, 2026
04c439b
test: 2pc crash recovery integration test
willothy Apr 27, 2026
46cb21b
feat: expose two_pc_recovered_total stat in OpenMetrics
willothy Apr 27, 2026
b90dff3
test: extend 2pc crash safety with clean-shutdown and flock specs, fi…
willothy Apr 27, 2026
699d361
test: cover 2pc commit-path recovery via synthetic WAL helper
willothy Apr 27, 2026
9867edd
style: format
willothy Apr 27, 2026
2495f24
docs: clean up and update 2pc comments
willothy Apr 27, 2026
a6365fd
refactor: drop unused 2pc segment accessors and path fields
willothy Apr 27, 2026
baaca29
docs: correct 2pc wal fsync_interval=0 behavior
willothy Apr 27, 2026
bf98401
test: clean up 2pc crash safety test comments
willothy Apr 27, 2026
c7faff0
fix: 2pc checkpoint interval honors documented seconds unit
willothy Apr 27, 2026
afe8f9f
docs: 2pc wal dir uses flock, not a probe
willothy Apr 27, 2026
39de6b2
feat: make 2pc wal optional, disabled by default
levkk Apr 27, 2026
8ad7d0b
schema
levkk Apr 27, 2026
c95ad2c
try to fix plugin integration test
levkk Apr 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,14 @@ jobs:
- name: Compute cache key
id: cache-key
run: echo "key=pgdog-bin-${{ runner.os }}-$(bash integration/ci/cache-key.sh)" >> "$GITHUB_OUTPUT"
# rust-cache must run before the binary restore: it lays down a
# stale target/ that can otherwise wipe target/release/pgdog when
# cargo reconciles fingerprints during plugin builds.
- name: Restore Rust cache for plugin builds
if: matrix.needs_rust_cache
uses: useblacksmith/rust-cache@v3
with:
prefix-key: build-v1
- name: Restore pgdog binaries
uses: useblacksmith/cache/restore@v5
with:
Expand All @@ -84,11 +92,6 @@ jobs:
target/release/pgdog
key: ${{ steps.cache-key.outputs.key }}
fail-on-cache-miss: true
- name: Restore Rust cache for plugin builds
if: matrix.needs_rust_cache
uses: useblacksmith/rust-cache@v3
with:
prefix-key: build-v1
- name: Setup dependencies
run: bash integration/ci/setup.sh --with-toxi
- name: Run ${{ matrix.name }}
Expand Down
33 changes: 33 additions & 0 deletions .schema/pgdog.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@
"tls_verify": "prefer",
"two_phase_commit": false,
"two_phase_commit_auto": null,
"two_phase_commit_wal_checkpoint_interval": 60,
"two_phase_commit_wal_dir": null,
"two_phase_commit_wal_fsync_interval": 2,
"two_phase_commit_wal_segment_size": 16777216,
"unique_id_function": "standard",
"unique_id_min": 0,
"workers": 2
Expand Down Expand Up @@ -1074,6 +1078,35 @@
],
"default": null
},
"two_phase_commit_wal_checkpoint_interval": {
"description": "How often, in seconds, to write a checkpoint record to the two-phase commit WAL and garbage-collect old segments.\n\n_Default:_ `60`\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_checkpoint_interval",
"type": "integer",
"format": "uint64",
"default": 60,
"minimum": 0
},
"two_phase_commit_wal_dir": {
"description": "Directory where the two-phase commit write-ahead log is stored.\n\n**Note:** This setting cannot be changed at runtime. PgDog acquires an exclusive `flock` on `<dir>/.lock` at startup. If the directory cannot be created or written to, or another PgDog process already holds the lock, the WAL is disabled and a warning is logged: 2PC will continue to function but will not be durable across restarts.\n\n_Default:_ `./pgdog_wal`\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_dir",
"type": [
"string",
"null"
],
"default": null
},
"two_phase_commit_wal_fsync_interval": {
"description": "How long, in milliseconds, the two-phase commit WAL writer waits to coalesce concurrent appends into a single fsync.\n\n**Note:** Setting this to `0` disables waiting for additional appends; records already queued in the channel when the writer wakes are still batched into one fsync. Higher values trade per-transaction commit latency for fewer fsyncs under load.\n\n_Default:_ `2`\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_fsync_interval",
"type": "integer",
"format": "uint64",
"default": 2,
"minimum": 0
},
"two_phase_commit_wal_segment_size": {
"description": "Maximum size, in bytes, of a single two-phase commit WAL segment file before it is rotated.\n\n_Default:_ `16777216` (16 MiB)\n\nhttps://docs.pgdog.dev/configuration/pgdog.toml/general/#two_phase_commit_wal_segment_size",
"type": "integer",
"format": "uint64",
"default": 16777216,
"minimum": 0
},
"unique_id_function": {
"description": "Unique ID generation function.",
"$ref": "#/$defs/UniqueIdFunction",
Expand Down
19 changes: 19 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ exclude = ["fuzz"]
members = [
"examples/demo",
"integration/rust",
"integration/two_pc_crash_safety/wal_helper",
"pgdog",
"pgdog-config",
"pgdog-macros",
Expand Down
6 changes: 6 additions & 0 deletions integration/two_pc_crash_safety/Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# frozen_string_literal: true

# Gems for the 2pc crash-safety integration specs.
# Exact versions are pinned by the adjacent Gemfile.lock.
source 'https://rubygems.org'
gem 'pg'                # Postgres client: talks to pgdog and directly to the shards
gem 'rspec', '~> 3.4'   # spec runner for crash_recovery_spec.rb
gem 'toxiproxy'         # fault injection: holds shard_1 traffic so pgdog hangs mid-2PC
30 changes: 30 additions & 0 deletions integration/two_pc_crash_safety/Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.6.2)
pg (1.5.9)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)
toxiproxy (2.0.2)

PLATFORMS
ruby

DEPENDENCIES
pg
rspec (~> 3.4)
toxiproxy

BUNDLED WITH
1.17.2
172 changes: 172 additions & 0 deletions integration/two_pc_crash_safety/crash_recovery_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# frozen_string_literal: true

require_relative 'rspec_helper'

CONFIG_DIR = __dir__

# Crash-safety specs for PgDog's two-phase-commit WAL. Each example
# spawns a real pgdog process against two Postgres shards (shard_1 sits
# behind a toxiproxy listener so traffic can be held in flight), kills
# or stops it, and asserts what WAL recovery does on the next start.
# Helpers (spawn_pgdog, wait_for_pgdog, prepared_xacts, metric, SHARDS,
# synthesize_phase2_wal, ...) come from rspec_helper.
describe '2pc crash safety' do
  let(:wal_dir) { @wal_dir }

  before(:each) do
    cleanup_prepared_xacts!
    @pid, @wal_dir = spawn_pgdog(CONFIG_DIR)
    wait_for_pgdog
  end

  after(:each) do
    stop_pgdog(@pid)
    FileUtils.rm_rf(@wal_dir) if @wal_dir
  end

  it 'rolls back orphan prepared xacts after a kill mid-PREPARE' do
    client = pgdog_conn
    client.exec('BEGIN')
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (0, 'a')")
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (1, 'b')")

    # Hold shard_1's responses so PREPARE TRANSACTION on shard_1 hangs
    # waiting for the reply. Latency keeps the connection open so pgdog
    # stays mid-2PC.
    Toxiproxy[:crash_safety_shard_1].toxic(:latency, latency: 30_000).apply do
      Thread.new do
        begin
          client.exec('COMMIT')
        rescue PG::Error
          # connection drops when we kill pgdog.
        end
      end
      # Wait for pgdog to issue PREPARE on shard_0 and start blocking
      # on shard_1's response before we kill it.
      sleep 1.0
      Process.kill('KILL', @pid)
      Process.waitpid(@pid)
      @pid = nil
    end

    # At least one shard must have an orphan prepared xact left behind,
    # otherwise recovery has nothing to clean up.
    leftover = SHARDS.flat_map { |s| prepared_xacts(s) }
    expect(leftover).not_to be_empty,
      'no prepared xacts after kill; the kill landed before any PREPARE made it through'

    # Restart pgdog with the same WAL directory. Recovery sees the
    # Begin record and drives ROLLBACK PREPARED on every participant.
    @pid, _ = spawn_pgdog(CONFIG_DIR, wal_dir: @wal_dir)
    wait_for_pgdog

    expect(metric('two_pc_recovered_total')).to be > 0,
      'restarted pgdog reports zero recovered txns; recovery did not run'

    expect(wait_for_no_prepared_xacts).to be(true),
      lambda {
        leftover = SHARDS.flat_map { |s| prepared_xacts(s) }
        "prepared xacts did not drain: #{leftover.inspect}"
      }

    # Rolled back on every shard: no rows may survive.
    SHARDS.each do |shard|
      c = shard_conn(shard)
      count = c.exec('SELECT COUNT(*) FROM crash_safety_test').to_a[0]['count'].to_i
      c.close
      expect(count).to eq(0), "expected no rows on #{shard[:db]}, found #{count}"
    end
  end

  it 'commits orphan prepared xacts after a kill mid-COMMIT-PREPARED' do
    # Synthesize the on-disk state pgdog would leave behind if it
    # crashed mid-Phase2, then start pgdog and verify it commits the
    # orphans. Catching pgdog reliably between "Committing record
    # durable" and "all COMMIT PREPAREDs done" from outside isn't
    # practical: toxiproxy is byte-level so any delay on shard_1 also
    # blocks PREPARE, and Postgres has no hook in COMMIT PREPARED.
    stop_pgdog(@pid)
    @pid = nil
    FileUtils.rm_rf(@wal_dir)
    Dir.mkdir(@wal_dir)

    gid = "__pgdog_2pc_#{rand(1..(1 << 62))}"
    synthesize_phase2_wal(@wal_dir, gid, 'pgdog', 'pgdog')

    # PgDog suffixes the gid with `_<shard_idx>` per shard so the
    # names don't collide on a single cluster (pgdog/src/backend/pool/
    # connection/binding.rs). Mirror that scheme here.
    SHARDS.each_with_index do |shard, idx|
      c = shard_conn(shard)
      c.exec('BEGIN')
      c.exec("INSERT INTO crash_safety_test (id, value) VALUES (#{idx}, 'r')")
      c.exec("PREPARE TRANSACTION '#{gid}_#{idx}'")
      c.close
    end

    @pid, _ = spawn_pgdog(CONFIG_DIR, wal_dir: @wal_dir)
    wait_for_pgdog

    expect(metric('two_pc_recovered_total')).to be > 0,
      'restarted pgdog reports zero recovered txns; recovery did not run'

    expect(wait_for_no_prepared_xacts).to be(true),
      lambda {
        leftover = SHARDS.flat_map { |s| prepared_xacts(s) }
        "prepared xacts did not drain: #{leftover.inspect}"
      }

    # Committed on every shard: exactly the one synthesized row survives.
    SHARDS.each do |shard|
      c = shard_conn(shard)
      count = c.exec('SELECT COUNT(*) FROM crash_safety_test').to_a[0]['count'].to_i
      c.close
      expect(count).to eq(1), "expected 1 row on #{shard[:db]}, found #{count}"
    end
  end

  it 'no recovery work after a clean shutdown' do
    client = pgdog_conn
    client.exec('BEGIN')
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (0, 'a')")
    client.exec("INSERT INTO crash_safety_test (id, value) VALUES (1, 'b')")
    client.exec('COMMIT')
    client.close

    # SIGTERM lets Manager::shutdown drain and wal.append_end every
    # finished txn. The WAL on disk should describe a complete cycle
    # (Begin, Committing, End) for every txn so recovery has nothing
    # to do on the next start.
    stop_pgdog(@pid, signal: 'TERM')
    @pid = nil

    @pid, _ = spawn_pgdog(CONFIG_DIR, wal_dir: @wal_dir)
    wait_for_pgdog

    expect(metric('two_pc_recovered_total')).to eq(0),
      'clean-shutdown WAL should have left no in-flight txns to recover'
  end

  it 'second pgdog refuses to share the WAL directory' do
    # First pgdog (started in before) holds the flock on @wal_dir/.lock.
    # Spawning a second one against the same dir should fail to acquire
    # the lock; enable_wal warns and continues without WAL durability,
    # so we verify the failure via the log line.
    log_path = File.join(CONFIG_DIR, 'pgdog-second.log')
    second_pid = Process.spawn(
      {
        'PGDOG_TWO_PHASE_COMMIT_WAL_DIR' => @wal_dir,
        'PGDOG_PORT' => '6433',
      },
      ENV['PGDOG_BIN'] || File.expand_path('../../target/release/pgdog', __dir__),
      '--config', File.join(CONFIG_DIR, 'pgdog.toml'),
      '--users', File.join(CONFIG_DIR, 'users.toml'),
      out: log_path, err: %i[child out]
    )

    begin
      # Poll the log for the lock-failure line instead of a fixed
      # sleep: a fixed 2s was racy when startup took longer on a slow
      # machine, and wasted time on a fast one. Bounded so a broken
      # build can't hang the suite.
      deadline = Time.now + 10
      loop do
        break if File.exist?(log_path) &&
                 File.read(log_path).include?('locked by another process')
        break if Time.now >= deadline
        sleep 0.1
      end
      stop_pgdog(second_pid)

      log = File.read(log_path)
      expect(log).to include('locked by another process'),
        "expected DirLocked error in second pgdog log; got:\n#{log}"
    ensure
      FileUtils.rm_f(log_path)
    end
  end
end
40 changes: 40 additions & 0 deletions integration/two_pc_crash_safety/pgdog.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# pgdog config for the 2pc crash-safety integration specs. The spec
# harness overrides the WAL directory and listen port per-process via
# PGDOG_* environment variables (see crash_recovery_spec.rb) — presumably
# spawn_pgdog does the same; verify in rspec_helper.
[general]
two_phase_commit = true
# Scraped by the specs for the two_pc_recovered_total counter.
openmetrics_port = 9090

# shard_0 is reached directly on the default Postgres port.
[[databases]]
name = "pgdog"
host = "127.0.0.1"
shard = 0
database_name = "shard_0"

# shard_1 is reached via the toxiproxy listener so the spec can hold
# requests in flight (latency toxic) and SIGKILL pgdog mid-2PC.
[[databases]]
name = "pgdog"
host = "127.0.0.1"
port = 5435
shard = 1
database_name = "shard_1"

[[sharded_tables]]
database = "pgdog"
column = "id"
data_type = "bigint"

# id = 0 routes to shard 0; id = 1 routes to shard 1, so a two-row
# insert in one transaction always spans both shards and triggers 2PC.
[[sharded_mappings]]
database = "pgdog"
column = "id"
kind = "list"
values = [0]
shard = 0

[[sharded_mappings]]
database = "pgdog"
column = "id"
kind = "list"
values = [1]
shard = 1

[admin]
password = "pgdog"
Loading
Loading