aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Antoine Tollenaere <atollena@gmail.com> 2022-05-02 19:16:33 +0200
committerGravatar GitHub <noreply@github.com> 2022-05-02 13:16:33 -0400
commit66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64 (patch)
tree0eb7a1a25907a398f1d3d647c4895749b4908803
parentc3572fdb30446b6e625113ee0329cc80810afd6a (diff)
downloadcoredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.tar.gz
coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.tar.zst
coredns-66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64.zip
plugin/cache: Add refresh mode setting to serve_stale (#5131)
This PR adds an optional REFRESH_MODE parameter on the serve_stale configuration directive of the cache plugin, which verifies that the upstream is still unavailable before returning stale entries. Signed-off-by: Antoine Tollenaere <atollena@gmail.com>
-rw-r--r--plugin/cache/README.md9
-rw-r--r--plugin/cache/cache.go31
-rw-r--r--plugin/cache/cache_test.go90
-rw-r--r--plugin/cache/handler.go24
-rw-r--r--plugin/cache/setup.go12
-rw-r--r--plugin/cache/setup_test.go29
6 files changed, 170 insertions, 25 deletions
diff --git a/plugin/cache/README.md b/plugin/cache/README.md
index 92c231be7..98363abbe 100644
--- a/plugin/cache/README.md
+++ b/plugin/cache/README.md
@@ -37,7 +37,7 @@ cache [TTL] [ZONES...] {
success CAPACITY [TTL] [MINTTL]
denial CAPACITY [TTL] [MINTTL]
prefetch AMOUNT [[DURATION] [PERCENTAGE%]]
- serve_stale [DURATION]
+ serve_stale [DURATION] [REFRESH_MODE]
}
~~~
@@ -57,7 +57,12 @@ cache [TTL] [ZONES...] {
* `serve_stale`, when serve\_stale is set, cache always will serve an expired entry to a client if there is one
available. When this happens, cache will attempt to refresh the cache entry after sending the expired cache
entry to the client. The responses have a TTL of 0. **DURATION** is how far back to consider
- stale responses as fresh. The default duration is 1h.
+ stale responses as fresh. The default duration is 1h. **REFRESH_MODE** controls when the attempt to refresh
+ the cache happens. `verified` will first verify that an entry is still unavailable from the source before sending
+ the stale response to the client. `immediate` will immediately send the expired response to the client before
+ checking to see if the entry is available from the source. **REFRESH_MODE** defaults to `immediate`. Setting this
+ value to `verified` can lead to increased latency when serving stale responses, but will prevent stale entries
+ from ever being served if an updated response can be retrieved from the source.
## Capacity and Eviction
diff --git a/plugin/cache/cache.go b/plugin/cache/cache.go
index 59439653f..58a73e72c 100644
--- a/plugin/cache/cache.go
+++ b/plugin/cache/cache.go
@@ -38,7 +38,9 @@ type Cache struct {
duration time.Duration
percentage int
- staleUpTo time.Duration
+ // Stale serve
+ staleUpTo time.Duration
+ verifyStale bool
// Testing.
now func() time.Time
@@ -227,6 +229,33 @@ func (w *ResponseWriter) Write(buf []byte) (int, error) {
return n, err
}
+// verifyStaleResponseWriter is a response writer that only writes messages if they should replace a
+// stale cache entry, and otherwise discards them.
+type verifyStaleResponseWriter struct {
+ *ResponseWriter
+ refreshed bool // set to true if the last WriteMsg wrote to ResponseWriter, false otherwise.
+}
+
+// newVerifyStaleResponseWriter returns a ResponseWriter to be used when verifying stale cache
+// entries. It only forward writes if an entry was successfully refreshed according to RFC8767,
+// section 4 (response is NoError or NXDomain), and ignores any other response.
+func newVerifyStaleResponseWriter(w *ResponseWriter) *verifyStaleResponseWriter {
+ return &verifyStaleResponseWriter{
+ w,
+ false,
+ }
+}
+
+// WriteMsg implements the dns.ResponseWriter interface.
+func (w *verifyStaleResponseWriter) WriteMsg(res *dns.Msg) error {
+ w.refreshed = false
+ if res.Rcode == dns.RcodeSuccess || res.Rcode == dns.RcodeNameError {
+ w.refreshed = true
+ return w.ResponseWriter.WriteMsg(res) // stores to the cache and send to client
+ }
+ return nil // else discard
+}
+
const (
maxTTL = dnsutil.MaximumDefaulTTL
minTTL = dnsutil.MinimalDefaultTTL
diff --git a/plugin/cache/cache_test.go b/plugin/cache/cache_test.go
index d839ea1a3..7f8c28e3f 100644
--- a/plugin/cache/cache_test.go
+++ b/plugin/cache/cache_test.go
@@ -266,7 +266,7 @@ func TestServeFromStaleCache(t *testing.T) {
req.SetQuestion("cached.org.", dns.TypeA)
ctx := context.TODO()
- // Cache example.org.
+ // Cache cached.org. with 60s TTL
rec := dnstest.NewRecorder(&test.ResponseWriter{})
c.staleUpTo = 1 * time.Hour
c.ServeDNS(ctx, rec, req)
@@ -304,6 +304,80 @@ func TestServeFromStaleCache(t *testing.T) {
}
}
+func TestServeFromStaleCacheFetchVerify(t *testing.T) {
+ c := New()
+ c.Next = ttlBackend(120)
+
+ req := new(dns.Msg)
+ req.SetQuestion("cached.org.", dns.TypeA)
+ ctx := context.TODO()
+
+ // Cache cached.org. with 120s TTL
+ rec := dnstest.NewRecorder(&test.ResponseWriter{})
+ c.staleUpTo = 1 * time.Hour
+ c.verifyStale = true
+ c.ServeDNS(ctx, rec, req)
+ if c.pcache.Len() != 1 {
+ t.Fatalf("Msg with > 0 TTL should have been cached")
+ }
+
+ tests := []struct {
+ name string
+ upstreamRCode int
+ upstreamTtl int
+ futureMinutes int
+ expectedRCode int
+ expectedTtl int
+ }{
+ // After 1 minutes of initial TTL, we should see a cached response
+ {"cached.org.", dns.RcodeSuccess, 200, 1, dns.RcodeSuccess, 60}, // ttl = 120 - 60 -- not refreshed
+
+ // After the 2 more minutes, we should see upstream responses because upstream is available
+ {"cached.org.", dns.RcodeSuccess, 200, 3, dns.RcodeSuccess, 200},
+
+ // After the TTL expired, if the server fails we should get the cached entry
+ {"cached.org.", dns.RcodeServerFailure, 200, 7, dns.RcodeSuccess, 0},
+
+ // After 1 more minutes, if the server serves nxdomain we should see them (despite being within the serve stale period)
+ {"cached.org.", dns.RcodeNameError, 150, 8, dns.RcodeNameError, 150},
+ }
+
+ for i, tt := range tests {
+ rec := dnstest.NewRecorder(&test.ResponseWriter{})
+ c.now = func() time.Time { return time.Now().Add(time.Duration(tt.futureMinutes) * time.Minute) }
+
+ if tt.upstreamRCode == dns.RcodeSuccess {
+ c.Next = ttlBackend(tt.upstreamTtl)
+ } else if tt.upstreamRCode == dns.RcodeServerFailure {
+ // Make upstream fail, should now rely on cache during the c.staleUpTo period
+ c.Next = servFailBackend(tt.upstreamTtl)
+ } else if tt.upstreamRCode == dns.RcodeNameError {
+ c.Next = nxDomainBackend(tt.upstreamTtl)
+ } else {
+ t.Fatal("upstream code not implemented")
+ }
+
+ r := req.Copy()
+ r.SetQuestion(tt.name, dns.TypeA)
+ ret, _ := c.ServeDNS(ctx, rec, r)
+ if ret != tt.expectedRCode {
+ t.Errorf("Test %d: expected rcode=%v, got rcode=%v", i, tt.expectedRCode, ret)
+ continue
+ }
+ if ret == dns.RcodeSuccess {
+ recTtl := rec.Msg.Answer[0].Header().Ttl
+ if tt.expectedTtl != int(recTtl) {
+ t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, recTtl)
+ }
+ } else if ret == dns.RcodeNameError {
+ soaTtl := rec.Msg.Ns[0].Header().Ttl
+ if tt.expectedTtl != int(soaTtl) {
+ t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, soaTtl)
+ }
+ }
+ }
+}
+
func TestNegativeStaleMaskingPositiveCache(t *testing.T) {
c := New()
c.staleUpTo = time.Minute * 10
@@ -454,6 +528,20 @@ func ttlBackend(ttl int) plugin.Handler {
})
}
+func servFailBackend(ttl int) plugin.Handler {
+ return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
+ m := new(dns.Msg)
+ m.SetReply(r)
+ m.Response, m.RecursionAvailable = true, true
+
+ m.Ns = []dns.RR{test.SOA(fmt.Sprintf("example.org. %d IN SOA sns.dns.icann.org. noc.dns.icann.org. 2016082540 7200 3600 1209600 3600", ttl))}
+
+ m.MsgHdr.Rcode = dns.RcodeServerFailure
+ w.WriteMsg(m)
+ return dns.RcodeServerFailure, nil
+ })
+}
+
func TestComputeTTL(t *testing.T) {
tests := []struct {
msgTTL time.Duration
diff --git a/plugin/cache/handler.go b/plugin/cache/handler.go
index 2b4c89350..d5112fc69 100644
--- a/plugin/cache/handler.go
+++ b/plugin/cache/handler.go
@@ -35,19 +35,29 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg)
ttl := 0
i := c.getIgnoreTTL(now, state, server)
- if i != nil {
- ttl = i.ttl(now)
- }
if i == nil {
crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do}
return c.doRefresh(ctx, state, crr)
}
+ ttl = i.ttl(now)
if ttl < 0 {
- servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc()
+ // serve stale behavior
+ if c.verifyStale {
+ crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do}
+ cw := newVerifyStaleResponseWriter(crr)
+ ret, err := c.doRefresh(ctx, state, cw)
+ if cw.refreshed {
+ return ret, err
+ }
+ }
+
// Adjust the time to get a 0 TTL in the reply built from a stale item.
now = now.Add(time.Duration(ttl) * time.Second)
- cw := newPrefetchResponseWriter(server, state, c)
- go c.doPrefetch(ctx, state, cw, i, now)
+ if !c.verifyStale {
+ cw := newPrefetchResponseWriter(server, state, c)
+ go c.doPrefetch(ctx, state, cw, i, now)
+ }
+ servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc()
} else if c.shouldPrefetch(i, now) {
cw := newPrefetchResponseWriter(server, state, c)
go c.doPrefetch(ctx, state, cw, i, now)
@@ -70,7 +80,7 @@ func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *Respo
}
}
-func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw *ResponseWriter) (int, error) {
+func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw dns.ResponseWriter) (int, error) {
if !state.Do() {
setDo(state.Req)
}
diff --git a/plugin/cache/setup.go b/plugin/cache/setup.go
index afbf361c5..e5258dc06 100644
--- a/plugin/cache/setup.go
+++ b/plugin/cache/setup.go
@@ -166,11 +166,11 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
case "serve_stale":
args := c.RemainingArgs()
- if len(args) > 1 {
+ if len(args) > 2 {
return nil, c.ArgErr()
}
ca.staleUpTo = 1 * time.Hour
- if len(args) == 1 {
+ if len(args) > 0 {
d, err := time.ParseDuration(args[0])
if err != nil {
return nil, err
@@ -180,6 +180,14 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
}
ca.staleUpTo = d
}
+ ca.verifyStale = false
+ if len(args) > 1 {
+ mode := strings.ToLower(args[1])
+ if mode != "immediate" && mode != "verify" {
+ return nil, fmt.Errorf("invalid value for serve_stale refresh mode: %s", mode)
+ }
+ ca.verifyStale = mode == "verify"
+ }
default:
return nil, c.ArgErr()
}
diff --git a/plugin/cache/setup_test.go b/plugin/cache/setup_test.go
index 875af7d03..675147d1b 100644
--- a/plugin/cache/setup_test.go
+++ b/plugin/cache/setup_test.go
@@ -117,20 +117,25 @@ func TestSetup(t *testing.T) {
func TestServeStale(t *testing.T) {
tests := []struct {
- input string
- shouldErr bool
- staleUpTo time.Duration
+ input string
+ shouldErr bool
+ staleUpTo time.Duration
+ verifyStale bool
}{
- {"serve_stale", false, 1 * time.Hour},
- {"serve_stale 20m", false, 20 * time.Minute},
- {"serve_stale 1h20m", false, 80 * time.Minute},
- {"serve_stale 0m", false, 0},
- {"serve_stale 0", false, 0},
+ {"serve_stale", false, 1 * time.Hour, false},
+ {"serve_stale 20m", false, 20 * time.Minute, false},
+ {"serve_stale 1h20m", false, 80 * time.Minute, false},
+ {"serve_stale 0m", false, 0, false},
+ {"serve_stale 0", false, 0, false},
+ {"serve_stale 0 verify", false, 0, true},
+ {"serve_stale 0 immediate", false, 0, false},
+ {"serve_stale 0 VERIFY", false, 0, true},
// fails
- {"serve_stale 20", true, 0},
- {"serve_stale -20m", true, 0},
- {"serve_stale aa", true, 0},
- {"serve_stale 1m nono", true, 0},
+ {"serve_stale 20", true, 0, false},
+ {"serve_stale -20m", true, 0, false},
+ {"serve_stale aa", true, 0, false},
+ {"serve_stale 1m nono", true, 0, false},
+ {"serve_stale 0 after nono", true, 0, false},
}
for i, test := range tests {
c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input))