Skip to content

Commit 7c57527

Browse files
folbrichtAnuskuss
andauthored
Failback immediately if reset-after is set to 0 and new flag to fail on empty responses (#448)
* Failback immediately * Add `empty-error` for detecting unusual empty responses * Treat all empty responses as errors * Multiple CNAMES + No records * folbricht's suggestions * Clarify `0` seconds * Suggestion for single-request failover * update * Check for EDE when deciding on 'empty-error' * fix failback --------- Co-authored-by: Anuskuss <[email protected]>
1 parent c706736 commit 7c57527

File tree

7 files changed

+137
-19
lines changed

7 files changed

+137
-19
lines changed

cmd/routedns/config.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,9 @@ type group struct {
109109
EDNS0Data []byte `toml:"edns0-data"` // EDNS0 modifier option data
110110

111111
// Failover/Failback options
112-
ResetAfter int `toml:"reset-after"` // Time in seconds after which to reset resolvers in fail-back and random groups, default 60.
112+
ResetAfter *int `toml:"reset-after"` // Time in seconds after which to reset resolvers in fail-back and random groups, or 0 to reset immediately, default 60.
113113
ServfailError bool `toml:"servfail-error"` // If true, SERVFAIL responses are considered errors and cause failover etc.
114+
EmptyError bool `toml:"empty-error"` // If true, empty responses are considered errors and cause failover etc.
114115

115116
// Cache options
116117
Backend *cacheBackend

cmd/routedns/main.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -370,20 +370,31 @@ func instantiateGroup(id string, g group, resolvers map[string]rdns.Resolver) er
370370
case "fail-rotate":
371371
opt := rdns.FailRotateOptions{
372372
ServfailError: g.ServfailError,
373+
EmptyError: g.EmptyError,
373374
}
374375
resolvers[id] = rdns.NewFailRotate(id, opt, gr...)
375376
case "fail-back":
377+
var resetAfterDefault int = 60
378+
if g.ResetAfter == nil {
379+
g.ResetAfter = &resetAfterDefault
380+
}
376381
opt := rdns.FailBackOptions{
377-
ResetAfter: time.Duration(time.Duration(g.ResetAfter) * time.Second),
382+
ResetAfter: time.Duration(time.Duration(*g.ResetAfter) * time.Second),
378383
ServfailError: g.ServfailError,
384+
EmptyError: g.EmptyError,
379385
}
380386
resolvers[id] = rdns.NewFailBack(id, opt, gr...)
381387
case "fastest":
382388
resolvers[id] = rdns.NewFastest(id, gr...)
383389
case "random":
390+
var resetAfterDefault int = 60
391+
if g.ResetAfter == nil {
392+
g.ResetAfter = &resetAfterDefault
393+
}
384394
opt := rdns.RandomOptions{
385-
ResetAfter: time.Duration(time.Duration(g.ResetAfter) * time.Second),
395+
ResetAfter: time.Duration(time.Duration(*g.ResetAfter) * time.Second),
386396
ServfailError: g.ServfailError,
397+
EmptyError: g.EmptyError,
387398
}
388399
resolvers[id] = rdns.NewRandom(id, opt, gr...)
389400
case "blocklist":

doc/configuration.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,7 @@ Options:
519519

520520
- `resolvers` - An array of upstream resolvers or modifiers.
521521
- `servfail-error` - If `true`, a SERVFAIL response from an upstream resolver is considered a failure triggering a switch to the next resolver. This can happen when DNSSEC validation fails for example. Default `false`.
522+
- `empty-error` - If `true`, an empty reponse from an upstream resolver is considered a failure triggering a switch to the next resolver. Default `false`.
522523

523524
#### Examples
524525

@@ -539,8 +540,9 @@ Fail-Back groups are instantiated with `type = "fail-back"` in the groups sectio
539540
Options:
540541

541542
- `resolvers` - An array of upstream resolvers or modifiers. The first in the array is the preferred resolver.
542-
- `reset-after` - Time in seconds before switching from an alternative resolver back to the preferred resolver (first in the list), default 60. Note: This is not a timeout argument. After a failure of the preferred resolver, this defines the amount of time to use alternative/failover resolvers before switching back to the preferred. You can have as many resolvers in the array as the time limit allows.
543+
- `reset-after` - Non-zero time in seconds before switching from an alternative resolver back to the preferred resolver (first in the list), or a negative number to switch back immediately, default 60. Note: This is not a timeout argument. After a failure of the preferred resolver, this defines the amount of time to use alternative/failover resolvers before switching back to the preferred. You can have as many resolvers in the array as the time limit allows.
543544
- `servfail-error` - If `true`, a SERVFAIL response from an upstream resolver is considered a failure triggering a failover. This can happen when DNSSEC validation fails for example. Default `false`.
545+
- `empty-error` - If `true`, an empty reponse from an upstream resolver is considered a failure triggering a switch to the next resolver. Default `false`.
544546

545547
#### Examples
546548

@@ -561,8 +563,9 @@ Random groups are instantiated with `type = "random"` in the groups section of t
561563
Options:
562564

563565
- `resolvers` - An array of upstream resolvers or modifiers.
564-
- `reset-after` - Time in seconds to disable a failed resolver, default 60.
566+
- `reset-after` - Non-zero time in seconds to disable a failed resolver, or a negative number to disable only for a single request, default 60.
565567
- `servfail-error` - If `true`, a SERVFAIL response from an upstream resolver is considered a failure which will take the resolver temporarily out of the group. This can happen when DNSSEC validation fails for example. Default `false`.
568+
- `empty-error` - If `true`, an empty reponse from an upstream resolver is considered a failure triggering a switch to the next resolver. Default `false`.
566569

567570
#### Examples
568571

failback.go

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ type FailBackOptions struct {
3636
// Determines if a SERVFAIL returned by a resolver should be considered an
3737
// error response and trigger a failover.
3838
ServfailError bool
39+
40+
// Determines if an empty reponse returned by a resolver should be considered an
41+
// error respone and trigger a failover.
42+
EmptyError bool
3943
}
4044

4145
var _ Resolver = &FailBack{}
@@ -61,9 +65,6 @@ func NewFailRouterMetrics(id string, available int) *FailRouterMetrics {
6165

6266
// NewFailBack returns a new instance of a failover resolver group.
6367
func NewFailBack(id string, opt FailBackOptions, resolvers ...Resolver) *FailBack {
64-
if opt.ResetAfter == 0 {
65-
opt.ResetAfter = time.Minute
66-
}
6768
return &FailBack{
6869
id: id,
6970
resolvers: resolvers,
@@ -81,7 +82,7 @@ func (r *FailBack) Resolve(q *dns.Msg, ci ClientInfo) (*dns.Msg, error) {
8182
a *dns.Msg
8283
)
8384
for i := 0; i < len(r.resolvers); i++ {
84-
resolver, active := r.current()
85+
resolver, active := r.current(i)
8586
log.With("resolver", resolver.String()).Debug("forwarding query to resolver")
8687
r.metrics.route.Add(resolver.String(), 1)
8788
a, err = resolver.Resolve(q, ci)
@@ -102,7 +103,12 @@ func (r *FailBack) String() string {
102103
}
103104

104105
// Thread-safe method to return the currently active resolver.
105-
func (r *FailBack) current() (Resolver, int) {
106+
func (r *FailBack) current(attempt int) (Resolver, int) {
107+
// With ResetAfter set to 0, simply iterate through the list of
108+
// resolvers on a per-request basis.
109+
if r.opt.ResetAfter == 0 {
110+
return r.resolvers[attempt], attempt
111+
}
106112
r.mu.RLock()
107113
defer r.mu.RUnlock()
108114
return r.resolvers[r.active], r.active
@@ -113,6 +119,11 @@ func (r *FailBack) current() (Resolver, int) {
113119
// requests. Another request could have initiated the failover already. So ignore if i is not
114120
// (no longer) the active store.
115121
func (r *FailBack) errorFrom(i int) {
122+
// If ResetAfter is set to 0, we fail-over to the next resolver, but
123+
// only for this single request. It won't affect any other requests.
124+
if r.opt.ResetAfter == 0 {
125+
return
126+
}
116127
r.mu.Lock()
117128
if i != r.active {
118129
r.mu.Unlock()
@@ -158,5 +169,46 @@ func (r *FailBack) startResetTimer() chan struct{} {
158169

159170
// Returns true is the response is considered successful given the options.
160171
func (r *FailBack) isSuccessResponse(a *dns.Msg) bool {
161-
return a == nil || !(r.opt.ServfailError && a.Rcode == dns.RcodeServerFailure)
172+
if a == nil {
173+
return true
174+
}
175+
if r.opt.ServfailError && a.Rcode == dns.RcodeServerFailure {
176+
return false
177+
}
178+
if r.opt.EmptyError {
179+
// Check if there are only CNAMEs in the answer section
180+
var onlyCNAME = true
181+
for _, rr := range a.Answer {
182+
if rr.Header().Rrtype != dns.TypeCNAME {
183+
onlyCNAME = false
184+
break
185+
}
186+
}
187+
// The answer may be blank because it was blocked by a filter.
188+
// If so (as determined by the presence of an EDE option), we
189+
// consider it "successful" as we shouldn't retry or fail-over
190+
// in that case.
191+
if onlyCNAME && !hasExtendedErrorBlocked(a) {
192+
return false
193+
}
194+
}
195+
return true
196+
}
197+
198+
// Returns true if the message contains an extended error option indicating it
199+
// was blocked, censored, or filtered.
200+
func hasExtendedErrorBlocked(msg *dns.Msg) bool {
201+
edns0 := msg.IsEdns0()
202+
if edns0 == nil {
203+
return false
204+
}
205+
for _, opt := range edns0.Option {
206+
if ede, ok := opt.(*dns.EDNS0_EDE); ok {
207+
switch ede.InfoCode {
208+
case dns.ExtendedErrorCodeBlocked, dns.ExtendedErrorCodeCensored, dns.ExtendedErrorCodeFiltered:
209+
return true
210+
}
211+
}
212+
}
213+
return false
162214
}

failback_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ func TestFailBackServfailOKOption(t *testing.T) {
113113
goodResolver := new(TestResolver)
114114

115115
// With ServfailError == false
116-
g1 := NewFailBack("test-fb", FailBackOptions{}, failResolver, goodResolver)
116+
g1 := NewFailBack("test-fb", FailBackOptions{ResetAfter: time.Second}, failResolver, goodResolver)
117117
q := new(dns.Msg)
118118
q.SetQuestion("test.com.", dns.TypeA)
119119

@@ -123,7 +123,7 @@ func TestFailBackServfailOKOption(t *testing.T) {
123123
require.Equal(t, 0, goodResolver.hitCount)
124124

125125
// With ServfailError == true
126-
g2 := NewFailBack("test-fb", FailBackOptions{ServfailError: true}, failResolver, goodResolver)
126+
g2 := NewFailBack("test-fb", FailBackOptions{ResetAfter: time.Second, ServfailError: true}, failResolver, goodResolver)
127127

128128
a, err = g2.Resolve(q, ci)
129129
require.NoError(t, err)

failrotate.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ type FailRotateOptions struct {
2727
// Determines if a SERVFAIL returned by a resolver should be considered an
2828
// error response and trigger a failover.
2929
ServfailError bool
30+
31+
// Determines if an empty reponse returned by a resolver should be considered an
32+
// error respone and trigger a failover.
33+
EmptyError bool
3034
}
3135

3236
var _ Resolver = &FailRotate{}
@@ -94,5 +98,28 @@ func (r *FailRotate) errorFrom(i int) {
9498

9599
// Returns true is the response is considered successful given the options.
96100
func (r *FailRotate) isSuccessResponse(a *dns.Msg) bool {
97-
return a == nil || !(r.opt.ServfailError && a.Rcode == dns.RcodeServerFailure)
101+
if a == nil {
102+
return true
103+
}
104+
if r.opt.ServfailError && a.Rcode == dns.RcodeServerFailure {
105+
return false
106+
}
107+
if r.opt.EmptyError {
108+
// Check if there are only CNAMEs in the answer section
109+
var onlyCNAME = true
110+
for _, rr := range a.Answer {
111+
if rr.Header().Rrtype != dns.TypeCNAME {
112+
onlyCNAME = false
113+
break
114+
}
115+
}
116+
// The answer may be blank because it was blocked by a filter.
117+
// If so (as determined by the presence of an EDE option), we
118+
// consider it "successful" as we shouldn't retry or fail-over
119+
// in that case.
120+
if onlyCNAME && !hasExtendedErrorBlocked(a) {
121+
return false
122+
}
123+
}
124+
return true
98125
}

random.go

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,14 @@ type RandomOptions struct {
3030
// Determines if a SERVFAIL returned by a resolver should be considered an
3131
// error response and cause the resolver to be removed from the group temporarily.
3232
ServfailError bool
33+
34+
// Determines if an empty reponse returned by a resolver should be considered an
35+
// error respone and trigger a failover.
36+
EmptyError bool
3337
}
3438

3539
// NewRandom returns a new instance of a random resolver group.
3640
func NewRandom(id string, opt RandomOptions, resolvers ...Resolver) *Random {
37-
rand.Seed(time.Now().UnixNano())
38-
if opt.ResetAfter == 0 {
39-
opt.ResetAfter = time.Minute
40-
}
4141
return &Random{
4242
id: id,
4343
resolvers: resolvers,
@@ -121,5 +121,29 @@ func (r *Random) reactivateLater(resolver Resolver) {
121121

122122
// Returns true is the response is considered successful given the options.
123123
func (r *Random) isSuccessResponse(a *dns.Msg) bool {
124-
return a == nil || !(r.opt.ServfailError && a.Rcode == dns.RcodeServerFailure)
124+
if a == nil {
125+
return true
126+
}
127+
if r.opt.ServfailError && a.Rcode == dns.RcodeServerFailure {
128+
return false
129+
}
130+
if r.opt.EmptyError {
131+
// Check if there are only CNAMEs in the answer section
132+
var onlyCNAME = true
133+
for _, rr := range a.Answer {
134+
if rr.Header().Rrtype != dns.TypeCNAME {
135+
onlyCNAME = false
136+
break
137+
}
138+
}
139+
// The answer may be blank because it was blocked by a filter.
140+
// If so (as determined by the presence of an EDE option), we
141+
// consider it "successful" as we shouldn't retry or fail-over
142+
// in that case.
143+
if onlyCNAME && !hasExtendedErrorBlocked(a) {
144+
return false
145+
}
146+
}
147+
return true
148+
125149
}

0 commit comments

Comments
 (0)