plugin/cache: Add refresh mode setting to serve_stale (#5131)

This PR adds an optional REFRESH_MODE parameter on the serve_stale configuration directive of the cache plugin, which verifies that the upstream is still unavailable before returning stale entries. Signed-off-by: Antoine Tollenaere <atollena@gmail.com>
author: Antoine Tollenaere <atollena@gmail.com> 2022-05-02 19:16:33 +0200
committer: GitHub <noreply@github.com> 2022-05-02 13:16:33 -0400
commit: 66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64 (patch)
tree: 0eb7a1a25907a398f1d3d647c4895749b4908803
parent: c3572fdb30446b6e625113ee0329cc80810afd6a (diff)
download: coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.tar.gz
coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.tar.zst
coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.zip
6 files changed, 170 insertions, 25 deletions
diff --git a/plugin/cache/README.md b/plugin/cache/README.md
index 92c231be7..98363abbe 100644
--- a/plugin/cache/README.md
+++ b/plugin/cache/README.md
@@ -37,7 +37,7 @@ cache [TTL] [ZONES...] {
     success CAPACITY [TTL] [MINTTL]
     denial CAPACITY [TTL] [MINTTL]
     prefetch AMOUNT [[DURATION] [PERCENTAGE%]]
-    serve_stale [DURATION]
+    serve_stale [DURATION] [REFRESH_MODE]
 }
 ~~~
 
@@ -57,7 +57,12 @@ cache [TTL] [ZONES...] {
 * `serve_stale`, when serve\_stale is set, cache always will serve an expired entry to a client if there is one
   available.  When this happens, cache will attempt to refresh the cache entry after sending the expired cache
   entry to the client. The responses have a TTL of 0. **DURATION** is how far back to consider
-  stale responses as fresh. The default duration is 1h.
+  stale responses as fresh. The default duration is 1h. **REFRESH_MODE** controls when the attempt to refresh
+  the cache happens. `verified` will first verify that an entry is still unavailable from the source before sending
+  the stale response to the client. `immediate` will immediately send the expired response to the client before
+  checking to see if the entry is available from the source. **REFRESH_MODE** defaults to `immediate`. Setting this
+  value to `verified` can lead to increased latency when serving stale responses, but will prevent stale entries
+  from ever being served if an updated response can be retrieved from the source.
 
 ## Capacity and Eviction
 
diff --git a/plugin/cache/cache.go b/plugin/cache/cache.go
index 59439653f..58a73e72c 100644
--- a/plugin/cache/cache.go
+++ b/plugin/cache/cache.go
@@ -38,7 +38,9 @@ type Cache struct {
 	duration   time.Duration
 	percentage int
 
-	staleUpTo time.Duration
+	// Stale serve
+	staleUpTo   time.Duration
+	verifyStale bool
 
 	// Testing.
 	now func() time.Time
@@ -227,6 +229,33 @@ func (w *ResponseWriter) Write(buf []byte) (int, error) {
 	return n, err
 }
 
+// verifyStaleResponseWriter is a response writer that only writes messages if they should replace a
+// stale cache entry, and otherwise discards them.
+type verifyStaleResponseWriter struct {
+	*ResponseWriter
+	refreshed bool // set to true if the last WriteMsg wrote to ResponseWriter, false otherwise.
+}
+
+// newVerifyStaleResponseWriter returns a ResponseWriter to be used when verifying stale cache
+// entries. It only forward writes if an entry was successfully refreshed according to RFC8767,
+// section 4 (response is NoError or NXDomain), and ignores any other response.
+func newVerifyStaleResponseWriter(w *ResponseWriter) *verifyStaleResponseWriter {
+	return &verifyStaleResponseWriter{
+		w,
+		false,
+	}
+}
+
+// WriteMsg implements the dns.ResponseWriter interface.
+func (w *verifyStaleResponseWriter) WriteMsg(res *dns.Msg) error {
+	w.refreshed = false
+	if res.Rcode == dns.RcodeSuccess || res.Rcode == dns.RcodeNameError {
+		w.refreshed = true
+		return w.ResponseWriter.WriteMsg(res) // stores to the cache and send to client
+	}
+	return nil // else discard
+}
+
 const (
 	maxTTL  = dnsutil.MaximumDefaulTTL
 	minTTL  = dnsutil.MinimalDefaultTTL
diff --git a/plugin/cache/cache_test.go b/plugin/cache/cache_test.go
index d839ea1a3..7f8c28e3f 100644
--- a/plugin/cache/cache_test.go
+++ b/plugin/cache/cache_test.go
@@ -266,7 +266,7 @@ func TestServeFromStaleCache(t *testing.T) {
 	req.SetQuestion("cached.org.", dns.TypeA)
 	ctx := context.TODO()
 
-	// Cache example.org.
+	// Cache cached.org. with 60s TTL
 	rec := dnstest.NewRecorder(&test.ResponseWriter{})
 	c.staleUpTo = 1 * time.Hour
 	c.ServeDNS(ctx, rec, req)
@@ -304,6 +304,80 @@ func TestServeFromStaleCache(t *testing.T) {
 	}
 }
 
+func TestServeFromStaleCacheFetchVerify(t *testing.T) {
+	c := New()
+	c.Next = ttlBackend(120)
+
+	req := new(dns.Msg)
+	req.SetQuestion("cached.org.", dns.TypeA)
+	ctx := context.TODO()
+
+	// Cache cached.org. with 120s TTL
+	rec := dnstest.NewRecorder(&test.ResponseWriter{})
+	c.staleUpTo = 1 * time.Hour
+	c.verifyStale = true
+	c.ServeDNS(ctx, rec, req)
+	if c.pcache.Len() != 1 {
+		t.Fatalf("Msg with > 0 TTL should have been cached")
+	}
+
+	tests := []struct {
+		name          string
+		upstreamRCode int
+		upstreamTtl   int
+		futureMinutes int
+		expectedRCode int
+		expectedTtl   int
+	}{
+		// After 1 minutes of initial TTL, we should see a cached response
+		{"cached.org.", dns.RcodeSuccess, 200, 1, dns.RcodeSuccess, 60}, // ttl = 120 - 60 -- not refreshed
+
+		// After the 2 more minutes, we should see upstream responses because upstream is available
+		{"cached.org.", dns.RcodeSuccess, 200, 3, dns.RcodeSuccess, 200},
+
+		// After the TTL expired, if the server fails we should get the cached entry
+		{"cached.org.", dns.RcodeServerFailure, 200, 7, dns.RcodeSuccess, 0},
+
+		// After 1 more minutes, if the server serves nxdomain we should see them (despite being within the serve stale period)
+		{"cached.org.", dns.RcodeNameError, 150, 8, dns.RcodeNameError, 150},
+	}
+
+	for i, tt := range tests {
+		rec := dnstest.NewRecorder(&test.ResponseWriter{})
+		c.now = func() time.Time { return time.Now().Add(time.Duration(tt.futureMinutes) * time.Minute) }
+
+		if tt.upstreamRCode == dns.RcodeSuccess {
+			c.Next = ttlBackend(tt.upstreamTtl)
+		} else if tt.upstreamRCode == dns.RcodeServerFailure {
+			// Make upstream fail, should now rely on cache during the c.staleUpTo period
+			c.Next = servFailBackend(tt.upstreamTtl)
+		} else if tt.upstreamRCode == dns.RcodeNameError {
+			c.Next = nxDomainBackend(tt.upstreamTtl)
+		} else {
+			t.Fatal("upstream code not implemented")
+		}
+
+		r := req.Copy()
+		r.SetQuestion(tt.name, dns.TypeA)
+		ret, _ := c.ServeDNS(ctx, rec, r)
+		if ret != tt.expectedRCode {
+			t.Errorf("Test %d: expected rcode=%v, got rcode=%v", i, tt.expectedRCode, ret)
+			continue
+		}
+		if ret == dns.RcodeSuccess {
+			recTtl := rec.Msg.Answer[0].Header().Ttl
+			if tt.expectedTtl != int(recTtl) {
+				t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, recTtl)
+			}
+		} else if ret == dns.RcodeNameError {
+			soaTtl := rec.Msg.Ns[0].Header().Ttl
+			if tt.expectedTtl != int(soaTtl) {
+				t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, soaTtl)
+			}
+		}
+	}
+}
+
 func TestNegativeStaleMaskingPositiveCache(t *testing.T) {
 	c := New()
 	c.staleUpTo = time.Minute * 10
@@ -454,6 +528,20 @@ func ttlBackend(ttl int) plugin.Handler {
 	})
 }
 
+func servFailBackend(ttl int) plugin.Handler {
+	return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
+		m := new(dns.Msg)
+		m.SetReply(r)
+		m.Response, m.RecursionAvailable = true, true
+
+		m.Ns = []dns.RR{test.SOA(fmt.Sprintf("example.org. %d IN	SOA	sns.dns.icann.org. noc.dns.icann.org. 2016082540 7200 3600 1209600 3600", ttl))}
+
+		m.MsgHdr.Rcode = dns.RcodeServerFailure
+		w.WriteMsg(m)
+		return dns.RcodeServerFailure, nil
+	})
+}
+
 func TestComputeTTL(t *testing.T) {
 	tests := []struct {
 		msgTTL      time.Duration
diff --git a/plugin/cache/handler.go b/plugin/cache/handler.go
index 2b4c89350..d5112fc69 100644
--- a/plugin/cache/handler.go
+++ b/plugin/cache/handler.go
@@ -35,19 +35,29 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg)
 
 	ttl := 0
 	i := c.getIgnoreTTL(now, state, server)
-	if i != nil {
-		ttl = i.ttl(now)
-	}
 	if i == nil {
 		crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do}
 		return c.doRefresh(ctx, state, crr)
 	}
+	ttl = i.ttl(now)
 	if ttl < 0 {
-		servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc()
+		// serve stale behavior
+		if c.verifyStale {
+			crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do}
+			cw := newVerifyStaleResponseWriter(crr)
+			ret, err := c.doRefresh(ctx, state, cw)
+			if cw.refreshed {
+				return ret, err
+			}
+		}
+
 		// Adjust the time to get a 0 TTL in the reply built from a stale item.
 		now = now.Add(time.Duration(ttl) * time.Second)
-		cw := newPrefetchResponseWriter(server, state, c)
-		go c.doPrefetch(ctx, state, cw, i, now)
+		if !c.verifyStale {
+			cw := newPrefetchResponseWriter(server, state, c)
+			go c.doPrefetch(ctx, state, cw, i, now)
+		}
+		servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc()
 	} else if c.shouldPrefetch(i, now) {
 		cw := newPrefetchResponseWriter(server, state, c)
 		go c.doPrefetch(ctx, state, cw, i, now)
@@ -70,7 +80,7 @@ func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *Respo
 	}
 }
 
-func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw *ResponseWriter) (int, error) {
+func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw dns.ResponseWriter) (int, error) {
 	if !state.Do() {
 		setDo(state.Req)
 	}
diff --git a/plugin/cache/setup.go b/plugin/cache/setup.go
index afbf361c5..e5258dc06 100644
--- a/plugin/cache/setup.go
+++ b/plugin/cache/setup.go
@@ -166,11 +166,11 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
 
 			case "serve_stale":
 				args := c.RemainingArgs()
-				if len(args) > 1 {
+				if len(args) > 2 {
 					return nil, c.ArgErr()
 				}
 				ca.staleUpTo = 1 * time.Hour
-				if len(args) == 1 {
+				if len(args) > 0 {
 					d, err := time.ParseDuration(args[0])
 					if err != nil {
 						return nil, err
@@ -180,6 +180,14 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
 					}
 					ca.staleUpTo = d
 				}
+				ca.verifyStale = false
+				if len(args) > 1 {
+					mode := strings.ToLower(args[1])
+					if mode != "immediate" && mode != "verify" {
+						return nil, fmt.Errorf("invalid value for serve_stale refresh mode: %s", mode)
+					}
+					ca.verifyStale = mode == "verify"
+				}
 			default:
 				return nil, c.ArgErr()
 			}
diff --git a/plugin/cache/setup_test.go b/plugin/cache/setup_test.go
index 875af7d03..675147d1b 100644
--- a/plugin/cache/setup_test.go
+++ b/plugin/cache/setup_test.go
@@ -117,20 +117,25 @@ func TestSetup(t *testing.T) {
 
 func TestServeStale(t *testing.T) {
 	tests := []struct {
-		input     string
-		shouldErr bool
-		staleUpTo time.Duration
+		input       string
+		shouldErr   bool
+		staleUpTo   time.Duration
+		verifyStale bool
 	}{
-		{"serve_stale", false, 1 * time.Hour},
-		{"serve_stale 20m", false, 20 * time.Minute},
-		{"serve_stale 1h20m", false, 80 * time.Minute},
-		{"serve_stale 0m", false, 0},
-		{"serve_stale 0", false, 0},
+		{"serve_stale", false, 1 * time.Hour, false},
+		{"serve_stale 20m", false, 20 * time.Minute, false},
+		{"serve_stale 1h20m", false, 80 * time.Minute, false},
+		{"serve_stale 0m", false, 0, false},
+		{"serve_stale 0", false, 0, false},
+		{"serve_stale 0 verify", false, 0, true},
+		{"serve_stale 0 immediate", false, 0, false},
+		{"serve_stale 0 VERIFY", false, 0, true},
 		// fails
-		{"serve_stale 20", true, 0},
-		{"serve_stale -20m", true, 0},
-		{"serve_stale aa", true, 0},
-		{"serve_stale 1m nono", true, 0},
+		{"serve_stale 20", true, 0, false},
+		{"serve_stale -20m", true, 0, false},
+		{"serve_stale aa", true, 0, false},
+		{"serve_stale 1m nono", true, 0, false},
+		{"serve_stale 0 after nono", true, 0, false},
 	}
 	for i, test := range tests {
 		c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input))
author	Antoine Tollenaere <atollena@gmail.com>	2022-05-02 19:16:33 +0200
committer	GitHub <noreply@github.com>	2022-05-02 13:16:33 -0400
commit	66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64 (patch)
tree	0eb7a1a25907a398f1d3d647c4895749b4908803
parent	c3572fdb30446b6e625113ee0329cc80810afd6a (diff)
download	coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.tar.gz coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.tar.zst coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.zip