aboutsummaryrefslogtreecommitdiff
path: root/plugin
diff options
context:
space:
mode:
authorGravatar Ricky S <singh.sangh@gmail.com> 2020-02-04 05:19:48 -0800
committerGravatar GitHub <noreply@github.com> 2020-02-04 14:19:48 +0100
commitefbe4ac5e80f7c59528d878f910b5edaf8cd17e1 (patch)
tree5986dfb53525110207338625e2aa8d2150e2807c /plugin
parent22cd28a7987afc24161b110b550d3e62347d1626 (diff)
downloadcoredns-efbe4ac5e80f7c59528d878f910b5edaf8cd17e1.tar.gz
coredns-efbe4ac5e80f7c59528d878f910b5edaf8cd17e1.tar.zst
coredns-efbe4ac5e80f7c59528d878f910b5edaf8cd17e1.zip
Add exponential backoff to healthcheck (#3643)
Move exponential backoff initialization to Start(). Signed-off-by: RickyRajinder <singh.sangh@gmail.com>. Move comment. Increase max interval and update README. Remove trailing whitespace. Change Start() param name back to interval.
Diffstat (limited to 'plugin')
-rw-r--r--plugin/forward/README.md6
-rw-r--r--plugin/pkg/up/up.go41
2 files changed, 26 insertions, 21 deletions
diff --git a/plugin/forward/README.md b/plugin/forward/README.md
index cc1845377..b4307d8dd 100644
--- a/plugin/forward/README.md
+++ b/plugin/forward/README.md
@@ -9,8 +9,10 @@
The *forward* plugin re-uses already opened sockets to the upstreams. It supports UDP, TCP and
DNS-over-TLS and uses in band health checking.
-When it detects an error a health check is performed. This checks runs in a loop, every *0.5s*, for
-as long as the upstream reports unhealthy. Once healthy we stop health checking (until the next
+When it detects an error a health check is performed. This check runs in a loop,
+starting with a *0.5s* interval and exponentially backing off with randomized intervals
+up to *60s* for as long as the upstream reports unhealthy. The exponential backoff
+will reset to *0.5s* after 15 minutes. Once healthy we stop health checking (until the next
error). The health checks use a recursive DNS query (`. IN NS`) to get upstream health. Any response
that is not a network error (REFUSED, NOTIMPL, SERVFAIL, etc) is taken as a healthy upstream. The
health check uses the same protocol as specified in **TO**. If `max_fails` is set to 0, no checking
diff --git a/plugin/pkg/up/up.go b/plugin/pkg/up/up.go
index 8f866311b..71c128234 100644
--- a/plugin/pkg/up/up.go
+++ b/plugin/pkg/up/up.go
@@ -5,6 +5,8 @@ package up
import (
"sync"
"time"
+
+ "github.com/cenkalti/backoff/v4"
)
// Probe is used to run a single Func until it returns nil (indicating a target is healthy). If a Func
@@ -13,8 +15,7 @@ import (
type Probe struct {
sync.Mutex
inprogress int
- interval time.Duration
- max time.Duration
+ expBackoff backoff.ExponentialBackOff
}
// Func is used to determine if a target is alive. If so this function must return nil.
@@ -31,7 +32,13 @@ func (p *Probe) Do(f Func) {
return
}
p.inprogress = active
- interval := p.interval
+ interval := p.expBackoff.NextBackOff()
+ // If exponential backoff has reached the maximum elapsed time (15 minutes),
+ // reset it and try again
+ if interval == -1 {
+ p.expBackoff.Reset()
+ interval = p.expBackoff.NextBackOff()
+ }
p.Unlock()
// Passed the lock. Now run f for as long as it does not return nil. Once nil is returned
// we return from the goroutine and we can accept another Func to run.
@@ -42,9 +49,6 @@ func (p *Probe) Do(f Func) {
break
}
time.Sleep(interval)
- if i%2 == 0 && i < 4 { // 4 is 2 doubles, so no need to increase anymore - this is *also* checked in double()
- p.double()
- }
p.Lock()
if p.inprogress == stop {
p.Unlock()
@@ -60,15 +64,6 @@ func (p *Probe) Do(f Func) {
}()
}
-func (p *Probe) double() {
- p.Lock()
- p.interval *= 2
- if p.interval > p.max {
- p.interval = p.max
- }
- p.Unlock()
-}
-
// Stop stops the probing.
func (p *Probe) Stop() {
p.Lock()
@@ -77,10 +72,20 @@ func (p *Probe) Stop() {
}
// Start will initialize the probe manager, after which probes can be initiated with Do.
+// It also initializes the exponential backoff, using interval as the initial interval.
func (p *Probe) Start(interval time.Duration) {
p.Lock()
- p.interval = interval
- p.max = interval * multiplier
+ eB := &backoff.ExponentialBackOff{
+ InitialInterval: interval,
+ RandomizationFactor: backoff.DefaultRandomizationFactor,
+ Multiplier: backoff.DefaultMultiplier,
+ MaxInterval: backoff.DefaultMaxInterval,
+ MaxElapsedTime: backoff.DefaultMaxElapsedTime,
+ Stop: backoff.Stop,
+ Clock: backoff.SystemClock,
+ }
+ p.expBackoff = *eB
+ p.expBackoff.Reset()
p.Unlock()
}
@@ -88,6 +93,4 @@ const (
idle = iota
active
stop
-
- multiplier = 4
)