aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Ricky S <singh.sangh@gmail.com> 2020-02-04 05:19:48 -0800
committerGravatar GitHub <noreply@github.com> 2020-02-04 14:19:48 +0100
commitefbe4ac5e80f7c59528d878f910b5edaf8cd17e1 (patch)
tree5986dfb53525110207338625e2aa8d2150e2807c
parent22cd28a7987afc24161b110b550d3e62347d1626 (diff)
downloadcoredns-efbe4ac5e80f7c59528d878f910b5edaf8cd17e1.tar.gz
coredns-efbe4ac5e80f7c59528d878f910b5edaf8cd17e1.tar.zst
coredns-efbe4ac5e80f7c59528d878f910b5edaf8cd17e1.zip
Add exponential backoff to healthcheck (#3643)
Move exponential backoff initialization to Start() Signed-off-by: RickyRajinder <singh.sangh@gmail.com> Move comment Increase max interval and update README Remove trailing whitespace Change Start() param name back to interval
-rw-r--r--go.mod1
-rw-r--r--go.sum2
-rw-r--r--plugin/forward/README.md6
-rw-r--r--plugin/pkg/up/up.go41
4 files changed, 29 insertions, 21 deletions
diff --git a/go.mod b/go.mod
index a792a88d8..067bf9653 100644
--- a/go.mod
+++ b/go.mod
@@ -12,6 +12,7 @@ require (
github.com/apache/thrift v0.13.0 // indirect
github.com/aws/aws-sdk-go v1.28.9
github.com/caddyserver/caddy v1.0.4
+ github.com/cenkalti/backoff/v4 v4.0.0
github.com/coredns/federation v0.0.0-20190818181423-e032b096babe
github.com/coreos/go-systemd v0.0.0-20190212144455-93d5ec2c7f76 // indirect
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f // indirect
diff --git a/go.sum b/go.sum
index 41a053011..9cbe8fe2e 100644
--- a/go.sum
+++ b/go.sum
@@ -78,6 +78,8 @@ github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kB
github.com/caddyserver/caddy v1.0.4 h1:wwuGSkUHo6RZ3oMpeTt7J09WBB87X5o+IZN4dKehcQE=
github.com/caddyserver/caddy v1.0.4/go.mod h1:uruyfVsyMcDb3IOzSKsi1x0wOjy1my/PxOSTcD+24jM=
github.com/cenkalti/backoff/v3 v3.0.0/go.mod h1:cIeZDE3IrqwwJl6VUwCN6trj1oXrTS4rc0ij+ULvLYs=
+github.com/cenkalti/backoff/v4 v4.0.0 h1:6VeaLF9aI+MAUQ95106HwWzYZgJJpZ4stumjj6RFYAU=
+github.com/cenkalti/backoff/v4 v4.0.0/go.mod h1:eEew/i+1Q6OrCDZh3WiXYv3+nJwBASZ8Bog/87DQnVg=
github.com/census-instrumentation/opencensus-proto v0.2.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY=
diff --git a/plugin/forward/README.md b/plugin/forward/README.md
index cc1845377..b4307d8dd 100644
--- a/plugin/forward/README.md
+++ b/plugin/forward/README.md
@@ -9,8 +9,10 @@
The *forward* plugin re-uses already opened sockets to the upstreams. It supports UDP, TCP and
DNS-over-TLS and uses in band health checking.
-When it detects an error a health check is performed. This checks runs in a loop, every *0.5s*, for
-as long as the upstream reports unhealthy. Once healthy we stop health checking (until the next
+When it detects an error a health check is performed. This check runs in a loop,
+starting with a *0.5s* interval and exponentially backing off with randomized intervals
+up to *60s* for as long as the upstream reports unhealthy. The exponential backoff
+will reset to *0.5s* after 15 minutes. Once healthy we stop health checking (until the next
error). The health checks use a recursive DNS query (`. IN NS`) to get upstream health. Any response
that is not a network error (REFUSED, NOTIMPL, SERVFAIL, etc) is taken as a healthy upstream. The
health check uses the same protocol as specified in **TO**. If `max_fails` is set to 0, no checking
diff --git a/plugin/pkg/up/up.go b/plugin/pkg/up/up.go
index 8f866311b..71c128234 100644
--- a/plugin/pkg/up/up.go
+++ b/plugin/pkg/up/up.go
@@ -5,6 +5,8 @@ package up
import (
"sync"
"time"
+
+ "github.com/cenkalti/backoff/v4"
)
// Probe is used to run a single Func until it returns nil (indicating a target is healthy). If a Func
@@ -13,8 +15,7 @@ import (
type Probe struct {
sync.Mutex
inprogress int
- interval time.Duration
- max time.Duration
+ expBackoff backoff.ExponentialBackOff
}
// Func is used to determine if a target is alive. If so this function must return nil.
@@ -31,7 +32,13 @@ func (p *Probe) Do(f Func) {
return
}
p.inprogress = active
- interval := p.interval
+ interval := p.expBackoff.NextBackOff()
+ // NextBackOff returns backoff.Stop (-1) once the maximum elapsed time (15 minutes)
+ // has been reached; in that case reset the backoff and start over from the initial interval.
+ if interval == -1 {
+ p.expBackoff.Reset()
+ interval = p.expBackoff.NextBackOff()
+ }
p.Unlock()
// Passed the lock. Now run f for as long as it returns an error. Once it returns nil
// we leave the goroutine and we can accept another Func to run.
@@ -42,9 +49,6 @@ func (p *Probe) Do(f Func) {
break
}
time.Sleep(interval)
- if i%2 == 0 && i < 4 { // 4 is 2 doubles, so no need to increase anymore - this is *also* checked in double()
- p.double()
- }
p.Lock()
if p.inprogress == stop {
p.Unlock()
@@ -60,15 +64,6 @@ func (p *Probe) Do(f Func) {
}()
}
-func (p *Probe) double() {
- p.Lock()
- p.interval *= 2
- if p.interval > p.max {
- p.interval = p.max
- }
- p.Unlock()
-}
-
// Stop stops the probing.
func (p *Probe) Stop() {
p.Lock()
@@ -77,10 +72,20 @@ func (p *Probe) Stop() {
}
// Start will initialize the probe manager, after which probes can be initiated with Do.
+// It also initializes the exponential backoff, using the given interval as its initial interval.
func (p *Probe) Start(interval time.Duration) {
p.Lock()
- p.interval = interval
- p.max = interval * multiplier
+ eB := &backoff.ExponentialBackOff{
+ InitialInterval: interval,
+ RandomizationFactor: backoff.DefaultRandomizationFactor,
+ Multiplier: backoff.DefaultMultiplier,
+ MaxInterval: backoff.DefaultMaxInterval,
+ MaxElapsedTime: backoff.DefaultMaxElapsedTime,
+ Stop: backoff.Stop,
+ Clock: backoff.SystemClock,
+ }
+ p.expBackoff = *eB
+ p.expBackoff.Reset()
p.Unlock()
}
@@ -88,6 +93,4 @@ const (
idle = iota
active
stop
-
- multiplier = 4
)