-rw-r--r--   plugin/cache/README.md       |  6
-rw-r--r--   plugin/cache/cache.go        |  2
-rw-r--r--   plugin/cache/cache_test.go   | 54
-rw-r--r--   plugin/cache/handler.go      | 63
-rw-r--r--   plugin/cache/setup.go        | 19
-rw-r--r--   plugin/cache/setup_test.go   | 37
6 files changed, 166 insertions, 15 deletions
diff --git a/plugin/cache/README.md b/plugin/cache/README.md
index 2958f90c9..14636e861 100644
--- a/plugin/cache/README.md
+++ b/plugin/cache/README.md
@@ -34,6 +34,7 @@ cache [TTL] [ZONES...] {
success CAPACITY [TTL] [MINTTL]
denial CAPACITY [TTL] [MINTTL]
prefetch AMOUNT [[DURATION] [PERCENTAGE%]]
+ serve_stale [DURATION]
}
~~~
@@ -50,6 +51,10 @@ cache [TTL] [ZONES...] {
**DURATION** defaults to 1m. Prefetching will happen when the TTL drops below **PERCENTAGE**,
which defaults to `10%`, or latest 1 second before TTL expiration. Values should be in the range `[10%, 90%]`.
Note the percent sign is mandatory. **PERCENTAGE** is treated as an `int`.
+* `serve_stale`, when set, makes the cache always serve an expired entry to a client if one is
+  available. When this happens, the cache attempts to refresh the entry after sending the expired
+  entry to the client. Such responses have a TTL of 0. **DURATION** is how far past expiration an
+  entry may still be served this way. The default duration is 1h.
## Capacity and Eviction
@@ -69,6 +74,7 @@ If monitoring is enabled (via the *prometheus* plugin) then the following metric
* `coredns_cache_hits_total{server, type}` - Counter of cache hits by cache type.
* `coredns_cache_misses_total{server}` - Counter of cache misses.
* `coredns_cache_drops_total{server}` - Counter of dropped messages.
+* `coredns_cache_served_stale_total{server}` - Counter of requests served from stale cache entries.
Cache types are either "denial" or "success". `Server` is the server handling the request, see the
metrics plugin for documentation.
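The `serve_stale` option documented above is configured like any other `cache` sub-directive. A minimal Corefile sketch of how it might be used (the zone, TTL, and upstream here are illustrative, not part of this change):

~~~ corefile
example.org {
    cache 300 {
        serve_stale 30m
    }
    forward . 8.8.8.8
}
~~~

With this configuration, answers expired for up to 30 minutes are still returned (with a TTL of 0) while a refresh is attempted in the background.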
diff --git a/plugin/cache/cache.go b/plugin/cache/cache.go
index 43a40c409..6b50c51cc 100644
--- a/plugin/cache/cache.go
+++ b/plugin/cache/cache.go
@@ -36,6 +36,8 @@ type Cache struct {
duration time.Duration
percentage int
+ staleUpTo time.Duration
+
// Testing.
now func() time.Time
}
diff --git a/plugin/cache/cache_test.go b/plugin/cache/cache_test.go
index b32353372..138458c8f 100644
--- a/plugin/cache/cache_test.go
+++ b/plugin/cache/cache_test.go
@@ -2,10 +2,12 @@ package cache
import (
"context"
+ "fmt"
"testing"
"time"
"github.com/coredns/coredns/plugin"
+ "github.com/coredns/coredns/plugin/pkg/dnstest"
"github.com/coredns/coredns/plugin/pkg/response"
"github.com/coredns/coredns/plugin/test"
"github.com/coredns/coredns/request"
@@ -233,7 +235,7 @@ func TestCacheZeroTTL(t *testing.T) {
c := New()
c.minpttl = 0
c.minnttl = 0
- c.Next = zeroTTLBackend()
+ c.Next = ttlBackend(0)
req := new(dns.Msg)
req.SetQuestion("example.org.", dns.TypeA)
@@ -248,6 +250,52 @@ func TestCacheZeroTTL(t *testing.T) {
}
}
+func TestServeFromStaleCache(t *testing.T) {
+ c := New()
+ c.Next = ttlBackend(60)
+
+ req := new(dns.Msg)
+ req.SetQuestion("cached.org.", dns.TypeA)
+ ctx := context.TODO()
+
+ // Cache example.org.
+ rec := dnstest.NewRecorder(&test.ResponseWriter{})
+ c.staleUpTo = 1 * time.Hour
+ c.ServeDNS(ctx, rec, req)
+ if c.pcache.Len() != 1 {
+ t.Fatalf("Msg with > 0 TTL should have been cached")
+ }
+
+ // No more backend resolutions, just from cache if available.
+ c.Next = plugin.HandlerFunc(func(context.Context, dns.ResponseWriter, *dns.Msg) (int, error) {
+ return 255, nil // Below, a 255 means we tried querying upstream.
+ })
+
+ tests := []struct {
+ name string
+ futureMinutes int
+ expectedResult int
+ }{
+ {"cached.org.", 30, 0},
+ {"cached.org.", 60, 0},
+ {"cached.org.", 70, 255},
+
+ {"notcached.org.", 30, 255},
+ {"notcached.org.", 60, 255},
+ {"notcached.org.", 70, 255},
+ }
+
+ for i, tt := range tests {
+ rec := dnstest.NewRecorder(&test.ResponseWriter{})
+ c.now = func() time.Time { return time.Now().Add(time.Duration(tt.futureMinutes) * time.Minute) }
+ r := req.Copy()
+ r.SetQuestion(tt.name, dns.TypeA)
+ if ret, _ := c.ServeDNS(ctx, rec, r); ret != tt.expectedResult {
+ t.Errorf("Test %d: expecting %v; got %v", i, tt.expectedResult, ret)
+ }
+ }
+}
+
func BenchmarkCacheResponse(b *testing.B) {
c := New()
c.prefetch = 1
@@ -286,13 +334,13 @@ func BackendHandler() plugin.Handler {
})
}
-func zeroTTLBackend() plugin.Handler {
+func ttlBackend(ttl int) plugin.Handler {
return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
m := new(dns.Msg)
m.SetReply(r)
m.Response, m.RecursionAvailable = true, true
- m.Answer = []dns.RR{test.A("example.org. 0 IN A 127.0.0.53")}
+ m.Answer = []dns.RR{test.A(fmt.Sprintf("example.org. %d IN A 127.0.0.53", ttl))}
w.WriteMsg(m)
return dns.RcodeSuccess, nil
})
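The test above fast-forwards time through the cache's overridable clock (the `now func() time.Time` field marked `// Testing.` in cache.go) instead of sleeping. A standalone sketch of that injectable-clock pattern, with illustrative names:

~~~ go
package main

import (
	"fmt"
	"time"
)

// staleChecker is a hypothetical stand-in for a type that needs testable time.
type staleChecker struct {
	now func() time.Time // injectable clock; production code leaves this as time.Now
}

func (s *staleChecker) expired(deadline time.Time) bool {
	return s.now().After(deadline)
}

func main() {
	s := &staleChecker{now: time.Now}
	deadline := time.Now().Add(60 * time.Minute)

	fmt.Println(s.expired(deadline)) // false: the real clock is an hour short of the deadline

	// A test can jump 70 minutes into the future without sleeping.
	s.now = func() time.Time { return time.Now().Add(70 * time.Minute) }
	fmt.Println(s.expired(deadline)) // true: simulated future time
}
~~~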
diff --git a/plugin/cache/handler.go b/plugin/cache/handler.go
index 4dc29167a..905a98ef4 100644
--- a/plugin/cache/handler.go
+++ b/plugin/cache/handler.go
@@ -26,19 +26,32 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg)
server := metrics.WithServer(ctx)
- i, found := c.get(now, state, server)
- if i != nil && found {
- resp := i.toMsg(r, now)
- w.WriteMsg(resp)
-
- if c.shouldPrefetch(i, now) {
- go c.doPrefetch(ctx, state, server, i, now)
- }
- return dns.RcodeSuccess, nil
+ ttl := 0
+ i := c.getIgnoreTTL(now, state, server)
+ if i != nil {
+ ttl = i.ttl(now)
+ }
+ if i == nil || -ttl >= int(c.staleUpTo.Seconds()) {
+ crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server}
+ return plugin.NextOrFailure(c.Name(), c.Next, ctx, crr, r)
}
+ if ttl < 0 {
+ servedStale.WithLabelValues(server).Inc()
+ // Adjust the time to get a 0 TTL in the reply built from a stale item.
+ now = now.Add(time.Duration(ttl) * time.Second)
+ go func() {
+ r := r.Copy()
+ crr := &ResponseWriter{Cache: c, state: state, server: server, prefetch: true, remoteAddr: w.LocalAddr()}
+ plugin.NextOrFailure(c.Name(), c.Next, ctx, crr, r)
+ }()
+ }
+ resp := i.toMsg(r, now)
+ w.WriteMsg(resp)
- crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server}
- return plugin.NextOrFailure(c.Name(), c.Next, ctx, crr, r)
+ if c.shouldPrefetch(i, now) {
+ go c.doPrefetch(ctx, state, server, i, now)
+ }
+ return dns.RcodeSuccess, nil
}
func (c *Cache) doPrefetch(ctx context.Context, state request.Request, server string, i *item, now time.Time) {
@@ -83,6 +96,27 @@ func (c *Cache) get(now time.Time, state request.Request, server string) (*item,
return nil, false
}
+// getIgnoreTTL unconditionally returns an item if it exists in the cache.
+func (c *Cache) getIgnoreTTL(now time.Time, state request.Request, server string) *item {
+ k := hash(state.Name(), state.QType(), state.Do())
+
+ if i, ok := c.ncache.Get(k); ok {
+ ttl := i.(*item).ttl(now)
+ if ttl > 0 || (c.staleUpTo > 0 && -ttl < int(c.staleUpTo.Seconds())) {
+ cacheHits.WithLabelValues(server, Denial).Inc()
+ }
+ return i.(*item)
+ }
+ if i, ok := c.pcache.Get(k); ok {
+ ttl := i.(*item).ttl(now)
+ if ttl > 0 || (c.staleUpTo > 0 && -ttl < int(c.staleUpTo.Seconds())) {
+ cacheHits.WithLabelValues(server, Success).Inc()
+ }
+ return i.(*item)
+ }
+ return nil
+}
+
func (c *Cache) exists(state request.Request) *item {
k := hash(state.Name(), state.QType(), state.Do())
if i, ok := c.ncache.Get(k); ok {
@@ -129,4 +163,11 @@ var (
Name: "drops_total",
Help: "The number responses that are not cached, because the reply is malformed.",
}, []string{"server"})
+
+ servedStale = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: plugin.Namespace,
+ Subsystem: "cache",
+ Name: "served_stale_total",
+ Help: "The number of requests served from stale cache entries.",
+ }, []string{"server"})
)
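The new handler logic boils down to a three-way decision on the item's remaining TTL: positive means fresh, slightly negative (within `staleUpTo`) means serve it stale with a TTL of 0 and refresh in the background, and anything older falls through to the next plugin. A self-contained sketch of just that decision, using an illustrative helper rather than the plugin's actual API:

~~~ go
package main

import (
	"fmt"
	"time"
)

// classify mirrors the branch order in ServeDNS: ttl is the item's remaining
// TTL in seconds (negative once expired), staleUpTo is the serve_stale window.
func classify(ttl int, staleUpTo time.Duration) string {
	if -ttl >= int(staleUpTo.Seconds()) {
		return "expired beyond the serve_stale window: forward to the next plugin"
	}
	if ttl < 0 {
		return "stale: serve with TTL 0 and refresh in the background"
	}
	return "fresh: serve straight from cache"
}

func main() {
	window := 1 * time.Hour
	fmt.Println(classify(30, window))    // still fresh
	fmt.Println(classify(-600, window))  // expired 10 minutes ago, inside the 1h window
	fmt.Println(classify(-4000, window)) // expired ~67 minutes ago, outside the window
}
~~~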
diff --git a/plugin/cache/setup.go b/plugin/cache/setup.go
index e4ca494db..62c5c9d2c 100644
--- a/plugin/cache/setup.go
+++ b/plugin/cache/setup.go
@@ -1,6 +1,7 @@
package cache
import (
+ "errors"
"fmt"
"strconv"
"time"
@@ -31,7 +32,7 @@ func setup(c *caddy.Controller) error {
c.OnStartup(func() error {
metrics.MustRegister(c,
cacheSize, cacheHits, cacheMisses,
- cachePrefetches, cacheDrops)
+ cachePrefetches, cacheDrops, servedStale)
return nil
})
@@ -176,6 +177,22 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
ca.percentage = num
}
+ case "serve_stale":
+ args := c.RemainingArgs()
+ if len(args) > 1 {
+ return nil, c.ArgErr()
+ }
+ ca.staleUpTo = 1 * time.Hour
+ if len(args) == 1 {
+ d, err := time.ParseDuration(args[0])
+ if err != nil {
+ return nil, err
+ }
+ if d < 0 {
+ return nil, errors.New("invalid negative duration for serve_stale")
+ }
+ ca.staleUpTo = d
+ }
default:
return nil, c.ArgErr()
}
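The parsing above relies on Go's `time.ParseDuration`, which requires a unit on anything other than a plain "0"; that is what makes `serve_stale 20` an error while `serve_stale 0` and `serve_stale 20m` succeed in the tests that follow. Negative values such as `-20m` parse fine and are rejected only by the explicit check added in setup.go. A quick illustration:

~~~ go
package main

import (
	"fmt"
	"time"
)

func main() {
	for _, arg := range []string{"1h", "20m", "0", "20", "-20m"} {
		d, err := time.ParseDuration(arg)
		if err != nil {
			fmt.Printf("%-4s -> parse error: %v\n", arg, err)
			continue
		}
		// setup.go additionally rejects negative durations for serve_stale.
		if d < 0 {
			fmt.Printf("%-4s -> parses as %v but is rejected as negative\n", arg, d)
			continue
		}
		fmt.Printf("%-4s -> accepted as %v\n", arg, d)
	}
}
~~~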
diff --git a/plugin/cache/setup_test.go b/plugin/cache/setup_test.go
index 975520d31..6352bcadb 100644
--- a/plugin/cache/setup_test.go
+++ b/plugin/cache/setup_test.go
@@ -1,6 +1,7 @@
package cache
import (
+ "fmt"
"testing"
"time"
@@ -113,3 +114,39 @@ func TestSetup(t *testing.T) {
}
}
}
+
+func TestServeStale(t *testing.T) {
+ tests := []struct {
+ input string
+ shouldErr bool
+ staleUpTo time.Duration
+ }{
+ {"serve_stale", false, 1 * time.Hour},
+ {"serve_stale 20m", false, 20 * time.Minute},
+ {"serve_stale 1h20m", false, 80 * time.Minute},
+ {"serve_stale 0m", false, 0},
+ {"serve_stale 0", false, 0},
+ // fails
+ {"serve_stale 20", true, 0},
+ {"serve_stale -20m", true, 0},
+ {"serve_stale aa", true, 0},
+ {"serve_stale 1m nono", true, 0},
+ }
+ for i, test := range tests {
+ c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input))
+ ca, err := cacheParse(c)
+ if test.shouldErr && err == nil {
+ t.Errorf("Test %v: Expected error but found nil", i)
+ continue
+ } else if !test.shouldErr && err != nil {
+ t.Errorf("Test %v: Expected no error but found error: %v", i, err)
+ continue
+ }
+ if test.shouldErr && err != nil {
+ continue
+ }
+ if ca.staleUpTo != test.staleUpTo {
+ t.Errorf("Test %v: Expected stale %v but found: %v", i, test.staleUpTo, ca.staleUpTo)
+ }
+ }
+}