plugin/proxy: decrease health timeouts (#1107)

Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
author: Miek Gieben <miek@miek.nl> 2017-09-24 20:05:36 +0100
committer: GitHub <noreply@github.com> 2017-09-24 20:05:36 +0100
commit: 2a32cd415911bea79784bee3387103bd9576f438 (patch)
tree: 899ec04050ce5dc7138341a795e83651b5e01999
parent: 148a99442dcd00fcda53858ada95f8148a9edd14 (diff)
download: coredns-2a32cd415911bea79784bee3387103bd9576f438.tar.gz
coredns-2a32cd415911bea79784bee3387103bd9576f438.tar.zst
coredns-2a32cd415911bea79784bee3387103bd9576f438.zip
5 files changed, 41 insertions, 15 deletions
diff --git a/plugin/proxy/README.md b/plugin/proxy/README.md
index 484a2e7ae..c23df8c42 100644
--- a/plugin/proxy/README.md
+++ b/plugin/proxy/README.md
@@ -41,10 +41,10 @@ proxy FROM TO... {
   communicate with it. The default value is 10 seconds ("10s").
 * `max_fails` is the number of failures within fail_timeout that are needed before considering
   a backend to be down. If 0, the backend will never be marked as down. Default is 1.
-* `health_check` will check path (on port) on each backend. If a backend returns a status code of
+* `health_check` will check **PATH** (on **PORT**) on each backend. If a backend returns a status code of
   200-399, then that backend is marked healthy for double the healthcheck duration.  If it doesn't,
   it is marked as unhealthy and no requests are routed to it.  If this option is not provided then
-  health checks are disabled.  The default duration is 30 seconds ("30s").
+  health checks are disabled.  The default duration is 4 seconds ("4s").
 * **IGNORED_NAMES** in `except` is a space-separated list of domains to exclude from proxying.
   Requests that match none of these names will be passed through.
 * `spray` when all backends are unhealthy, randomly pick one to send the traffic to. (This is
diff --git a/plugin/proxy/google.go b/plugin/proxy/google.go
index d7248e149..91d4cba26 100644
--- a/plugin/proxy/google.go
+++ b/plugin/proxy/google.go
@@ -192,9 +192,9 @@ func newUpstream(hosts []string, old *staticUpstream) Upstream {
 	upstream := &staticUpstream{
 		from: old.from,
 		HealthCheck: healthcheck.HealthCheck{
-			FailTimeout: 10 * time.Second,
+			FailTimeout: 5 * time.Second,
 			MaxFails:    3,
-			Future:      60 * time.Second,
+			Future:      12 * time.Second,
 		},
 		ex:                old.ex,
 		IgnoredSubDomains: old.IgnoredSubDomains,
diff --git a/plugin/proxy/lookup.go b/plugin/proxy/lookup.go
index 238666608..fc0f3e01f 100644
--- a/plugin/proxy/lookup.go
+++ b/plugin/proxy/lookup.go
@@ -5,6 +5,7 @@ package proxy
 import (
 	"context"
 	"fmt"
+	"net"
 	"sync/atomic"
 	"time"
 
@@ -26,9 +27,9 @@ func NewLookupWithOption(hosts []string, opts Options) Proxy {
 	upstream := &staticUpstream{
 		from: ".",
 		HealthCheck: healthcheck.HealthCheck{
-			FailTimeout: 10 * time.Second,
-			MaxFails:    3, // TODO(miek): disable error checking for simple lookups?
-			Future:      60 * time.Second,
+			FailTimeout: 5 * time.Second,
+			MaxFails:    3,
+			Future:      12 * time.Second,
 		},
 		ex: newDNSExWithOption(opts),
 	}
@@ -85,7 +86,7 @@ func (p Proxy) lookup(state request.Request) (*dns.Msg, error) {
 			}
 
 			// duplicated from proxy.go, but with a twist, we don't write the
-			// reply back to the client, we return it and there is no monitoring.
+			// reply back to the client, we return it and there is no monitoring to update here.
 
 			atomic.AddInt64(&host.Conns, 1)
 
@@ -96,11 +97,20 @@ func (p Proxy) lookup(state request.Request) (*dns.Msg, error) {
 			if backendErr == nil {
 				return reply, nil
 			}
+
+			if oe, ok := backendErr.(*net.OpError); ok {
+				if oe.Timeout() { // see proxy.go for docs.
+					continue
+				}
+			}
+
 			timeout := host.FailTimeout
 			if timeout == 0 {
-				timeout = 10 * time.Second
+				timeout = 2 * time.Second
 			}
+
 			atomic.AddInt32(&host.Fails, 1)
+
 			go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
 				time.Sleep(timeout)
 				atomic.AddInt32(&host.Fails, -1)
diff --git a/plugin/proxy/proxy.go b/plugin/proxy/proxy.go
index 9d1e1906b..b2c713a53 100644
--- a/plugin/proxy/proxy.go
+++ b/plugin/proxy/proxy.go
@@ -4,6 +4,7 @@ package proxy
 import (
 	"errors"
 	"fmt"
+	"net"
 	"sync/atomic"
 	"time"
 
@@ -56,7 +57,7 @@ type Upstream interface {
 
 // tryDuration is how long to try upstream hosts; failures result in
 // immediate retries until this duration ends or we get a nil host.
-var tryDuration = 60 * time.Second
+var tryDuration = 16 * time.Second
 
 // ServeDNS satisfies the plugin.Handler interface.
 func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
@@ -112,11 +113,26 @@ func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (
 				return 0, taperr
 			}
 
+			// A "ANY isc.org" query is being dropped by ISC's nameserver, we see this as a i/o timeout, but
+			// would then mark our upstream is being broken. We should not do this if we consider the error temporary.
+			// Of course it could really be that our upstream is broken
+			if oe, ok := backendErr.(*net.OpError); ok {
+				// Note this keeps looping and trying until tryDuration is hit, at which point our client
+				// might be long gone...
+				if oe.Timeout() {
+					// Our upstream's upstream is problably messing up, continue with next selected
+					// host - which my be the *same* one as we don't set any uh.Fails.
+					continue
+				}
+			}
+
 			timeout := host.FailTimeout
 			if timeout == 0 {
-				timeout = 10 * time.Second
+				timeout = 2 * time.Second
 			}
+
 			atomic.AddInt32(&host.Fails, 1)
+
 			go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
 				time.Sleep(timeout)
 				atomic.AddInt32(&host.Fails, -1)
diff --git a/plugin/proxy/upstream.go b/plugin/proxy/upstream.go
index f7ad58ea8..0ab29de51 100644
--- a/plugin/proxy/upstream.go
+++ b/plugin/proxy/upstream.go
@@ -31,9 +31,9 @@ func NewStaticUpstreams(c *caddyfile.Dispenser) ([]Upstream, error) {
 		upstream := &staticUpstream{
 			from: ".",
 			HealthCheck: healthcheck.HealthCheck{
-				FailTimeout: 10 * time.Second,
-				MaxFails:    1,
-				Future:      60 * time.Second,
+				FailTimeout: 5 * time.Second,
+				MaxFails:    3,
+				Future:      12 * time.Second,
 			},
 			ex: newDNSEx(),
 		}
@@ -121,7 +121,7 @@ func parseBlock(c *caddyfile.Dispenser, u *staticUpstream) error {
 		if err != nil {
 			return err
 		}
-		u.HealthCheck.Interval = 30 * time.Second
+		u.HealthCheck.Interval = 4 * time.Second
 		if c.NextArg() {
 			dur, err := time.ParseDuration(c.Val())
 			if err != nil {
author	Miek Gieben <miek@miek.nl>	2017-09-24 20:05:36 +0100
committer	GitHub <noreply@github.com>	2017-09-24 20:05:36 +0100
commit	2a32cd415911bea79784bee3387103bd9576f438 (patch)
tree	899ec04050ce5dc7138341a795e83651b5e01999
parent	148a99442dcd00fcda53858ada95f8148a9edd14 (diff)
download	coredns-2a32cd415911bea79784bee3387103bd9576f438.tar.gz coredns-2a32cd415911bea79784bee3387103bd9576f438.tar.zst coredns-2a32cd415911bea79784bee3387103bd9576f438.zip