aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--plugin/health/README.md8
-rw-r--r--plugin/health/health.go19
-rw-r--r--plugin/health/health_test.go4
-rw-r--r--plugin/health/setup.go3
-rw-r--r--plugin/metrics/addr.go52
-rw-r--r--plugin/metrics/addr_test.go17
-rw-r--r--plugin/metrics/metrics.go39
-rw-r--r--plugin/metrics/metrics_test.go2
-rw-r--r--plugin/metrics/setup.go39
-rw-r--r--plugin/reload/README.md28
-rw-r--r--test/reload_test.go71
11 files changed, 227 insertions, 55 deletions
diff --git a/plugin/health/README.md b/plugin/health/README.md
index f94355649..e547028c2 100644
--- a/plugin/health/README.md
+++ b/plugin/health/README.md
@@ -17,9 +17,9 @@ health [ADDRESS]
~~~
Optionally takes an address; the default is `:8080`. The health path is fixed to `/health`. The
-health endpoint returns a 200 response code and the word "OK" when CoreDNS is healthy. It returns
-a 503. *health* periodically (1s) polls plugin that exports health information. If any of the
-plugin signals that it is unhealthy, the server will go unhealthy too. Each plugin that supports
+health endpoint returns a 200 response code and the word "OK" when this server is healthy. It returns
+a 503. *health* periodically (1s) polls plugins that exports health information. If any of the
+plugins signals that it is unhealthy, the server will go unhealthy too. Each plugin that supports
health checks has a section "Health" in their README.
More options can be set with this extended syntax:
@@ -33,7 +33,7 @@ health [ADDRESS] {
* Where `lameduck` will make the process unhealthy then *wait* for **DURATION** before the process
shuts down.
-If you have multiple Server Block and need to export health for each of the plugins, you must run
+If you have multiple Server Blocks and need to export health for each of the plugins, you must run
health endpoints on different ports:
~~~ corefile
diff --git a/plugin/health/health.go b/plugin/health/health.go
index f6b82804e..782099154 100644
--- a/plugin/health/health.go
+++ b/plugin/health/health.go
@@ -16,8 +16,9 @@ type health struct {
Addr string
lameduck time.Duration
- ln net.Listener
- mux *http.ServeMux
+ ln net.Listener
+ nlSetup bool
+ mux *http.ServeMux
// A slice of Healthers that the health plugin will poll every second for their health status.
h []Healther
@@ -45,6 +46,7 @@ func (h *health) OnStartup() error {
h.ln = ln
h.mux = http.NewServeMux()
+ h.nlSetup = true
h.mux.HandleFunc(path, func(w http.ResponseWriter, r *http.Request) {
if h.Ok() {
@@ -61,7 +63,13 @@ func (h *health) OnStartup() error {
return nil
}
-func (h *health) OnShutdown() error {
+func (h *health) OnRestart() error { return h.OnFinalShutdown() }
+
+func (h *health) OnFinalShutdown() error {
+ if !h.nlSetup {
+ return nil
+ }
+
// Stop polling plugins
h.pollstop <- true
// NACK health
@@ -72,11 +80,10 @@ func (h *health) OnShutdown() error {
time.Sleep(h.lameduck)
}
- if h.ln != nil {
- return h.ln.Close()
- }
+ h.ln.Close()
h.stop <- true
+ h.nlSetup = false
return nil
}
diff --git a/plugin/health/health_test.go b/plugin/health/health_test.go
index 8f325c9c4..acd06ea37 100644
--- a/plugin/health/health_test.go
+++ b/plugin/health/health_test.go
@@ -17,7 +17,7 @@ func TestHealth(t *testing.T) {
if err := h.OnStartup(); err != nil {
t.Fatalf("Unable to startup the health server: %v", err)
}
- defer h.OnShutdown()
+ defer h.OnFinalShutdown()
go func() {
<-h.pollstop
@@ -73,5 +73,5 @@ func TestHealthLameduck(t *testing.T) {
return
}()
- h.OnShutdown()
+ h.OnFinalShutdown()
}
diff --git a/plugin/health/setup.go b/plugin/health/setup.go
index bd5b5c3fc..0b90d829a 100644
--- a/plugin/health/setup.go
+++ b/plugin/health/setup.go
@@ -60,7 +60,8 @@ func setup(c *caddy.Controller) error {
})
c.OnStartup(h.OnStartup)
- c.OnShutdown(h.OnShutdown)
+ c.OnRestart(h.OnRestart)
+ c.OnFinalShutdown(h.OnFinalShutdown)
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
return nil
diff --git a/plugin/metrics/addr.go b/plugin/metrics/addr.go
new file mode 100644
index 000000000..fe8e5e5fe
--- /dev/null
+++ b/plugin/metrics/addr.go
@@ -0,0 +1,52 @@
+package metrics
+
+// addrs keeps track on which addrs we listen, so we only start one listener, is
+// prometheus is used in multiple Server Blocks.
+type addrs struct {
+ a map[string]value
+}
+
+type value struct {
+ state int
+ f func() error
+}
+
+var uniqAddr addrs
+
+func newAddress() addrs {
+ return addrs{a: make(map[string]value)}
+}
+
+func (a addrs) setAddress(addr string, f func() error) {
+ if a.a[addr].state == done {
+ return
+ }
+ a.a[addr] = value{todo, f}
+}
+
+// setAddressTodo sets addr to 'todo' again.
+func (a addrs) setAddressTodo(addr string) {
+ v, ok := a.a[addr]
+ if !ok {
+ return
+ }
+ v.state = todo
+ a.a[addr] = v
+}
+
+// forEachTodo iterates for a and executes f for each element that is 'todo' and sets it to 'done'.
+func (a addrs) forEachTodo() error {
+ for k, v := range a.a {
+ if v.state == todo {
+ v.f()
+ }
+ v.state = done
+ a.a[k] = v
+ }
+ return nil
+}
+
+const (
+ todo = 1
+ done = 2
+)
diff --git a/plugin/metrics/addr_test.go b/plugin/metrics/addr_test.go
new file mode 100644
index 000000000..d7a08656b
--- /dev/null
+++ b/plugin/metrics/addr_test.go
@@ -0,0 +1,17 @@
+package metrics
+
+import "testing"
+
+func TestForEachTodo(t *testing.T) {
+ a, i := newAddress(), 0
+ a.setAddress("test", func() error { i++; return nil })
+
+ a.forEachTodo()
+ if i != 1 {
+ t.Errorf("Failed to executed f for %s", "test")
+ }
+ a.forEachTodo()
+ if i != 1 {
+ t.Errorf("Executed f twice instead of once")
+ }
+}
diff --git a/plugin/metrics/metrics.go b/plugin/metrics/metrics.go
index d52b6717a..384aa8bb4 100644
--- a/plugin/metrics/metrics.go
+++ b/plugin/metrics/metrics.go
@@ -19,11 +19,12 @@ import (
// Metrics holds the prometheus configuration. The metrics' path is fixed to be /metrics
type Metrics struct {
- Next plugin.Handler
- Addr string
- Reg *prometheus.Registry
- ln net.Listener
- mux *http.ServeMux
+ Next plugin.Handler
+ Addr string
+ Reg *prometheus.Registry
+ ln net.Listener
+ lnSetup bool
+ mux *http.ServeMux
zoneNames []string
zoneMap map[string]bool
@@ -93,7 +94,8 @@ func (m *Metrics) OnStartup() error {
}
m.ln = ln
- ListenAddr = m.ln.Addr().String()
+ m.lnSetup = true
+ ListenAddr = m.ln.Addr().String() // For tests
m.mux = http.NewServeMux()
m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{}))
@@ -104,14 +106,29 @@ func (m *Metrics) OnStartup() error {
return nil
}
-// OnShutdown tears down the metrics listener on shutdown and restart.
-func (m *Metrics) OnShutdown() error {
+// OnRestart stops the listener on reload.
+func (m *Metrics) OnRestart() error {
+ if !m.lnSetup {
+ return nil
+ }
+
+ uniqAddr.setAddressTodo(m.Addr)
+
+ m.ln.Close()
+ m.lnSetup = false
+ return nil
+}
+
+// OnFinalShutdown tears down the metrics listener on shutdown and restart.
+func (m *Metrics) OnFinalShutdown() error {
// We allow prometheus statements in multiple Server Blocks, but only the first
// will open the listener, for the rest they are all nil; guard against that.
- if m.ln != nil {
- return m.ln.Close()
+ if !m.lnSetup {
+ return nil
}
- return nil
+
+ m.lnSetup = false
+ return m.ln.Close()
}
func keys(m map[string]bool) []string {
diff --git a/plugin/metrics/metrics_test.go b/plugin/metrics/metrics_test.go
index 5cdbb448c..bc1e3beb5 100644
--- a/plugin/metrics/metrics_test.go
+++ b/plugin/metrics/metrics_test.go
@@ -18,7 +18,7 @@ func TestMetrics(t *testing.T) {
if err := met.OnStartup(); err != nil {
t.Fatalf("Failed to start metrics handler: %s", err)
}
- defer met.OnShutdown()
+ defer met.OnFinalShutdown()
met.AddZone("example.org.")
diff --git a/plugin/metrics/setup.go b/plugin/metrics/setup.go
index 7dac5ee24..5b605f554 100644
--- a/plugin/metrics/setup.go
+++ b/plugin/metrics/setup.go
@@ -15,7 +15,7 @@ func init() {
Action: setup,
})
- uniqAddr = addrs{a: make(map[string]int)}
+ uniqAddr = newAddress()
}
func setup(c *caddy.Controller) error {
@@ -29,14 +29,15 @@ func setup(c *caddy.Controller) error {
return m
})
- for a, v := range uniqAddr.a {
- if v == todo {
- c.OncePerServerBlock(m.OnStartup)
- }
- uniqAddr.a[a] = done
- }
+ c.OncePerServerBlock(func() error {
+ c.OnStartup(func() error {
+ return uniqAddr.forEachTodo()
+ })
+ return nil
+ })
- c.OnShutdown(m.OnShutdown)
+ c.OnRestart(m.OnRestart)
+ c.OnFinalShutdown(m.OnFinalShutdown)
return nil
}
@@ -45,7 +46,7 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) {
var met = New(defaultAddr)
defer func() {
- uniqAddr.SetAddress(met.Addr)
+ uniqAddr.setAddress(met.Addr, met.OnStartup)
}()
i := 0
@@ -75,25 +76,5 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) {
return met, nil
}
-var uniqAddr addrs
-
-// Keep track on which addrs we listen, so we only start one listener.
-type addrs struct {
- a map[string]int
-}
-
-func (a *addrs) SetAddress(addr string) {
- // If already there and set to done, we've already started this listener.
- if a.a[addr] == done {
- return
- }
- a.a[addr] = todo
-}
-
// defaultAddr is the address the where the metrics are exported by default.
const defaultAddr = "localhost:9153"
-
-const (
- todo = 1
- done = 2
-)
diff --git a/plugin/reload/README.md b/plugin/reload/README.md
index 9ebbe2dda..18d40ca5d 100644
--- a/plugin/reload/README.md
+++ b/plugin/reload/README.md
@@ -13,7 +13,8 @@ or SIGUSR1 after changing the Corefile.
The reloads are graceful - you should not see any loss of service when the
reload happens. Even if the new Corefile has an error, CoreDNS will continue
-to run the old config and an error message will be printed to the log.
+to run the old config and an error message will be printed to the log. But see
+the Bugs section for failure modes.
In some environments (for example, Kubernetes), there may be many CoreDNS
instances that started very near the same time and all share a common
@@ -59,3 +60,28 @@ Check every 10 seconds (jitter is automatically set to 10 / 2 = 5 in this case):
erratic
}
~~~
+
+## Bugs
+
+The reload happens without data loss (i.e. DNS queries keep flowing), but there is a corner case
+where the reload fails, and you loose functionality. Consider the following Corefile:
+
+~~~ txt
+. {
+ health :8080
+ whoami
+}
+~~~
+
+CoreDNS starts and serves health from :8080. Now you change `:8080` to `:443` not knowing a process
+is already listening on that port. The process reloads and performs the following steps:
+
+1. close the listener on 8080
+2. reload and parse the config again
+3. fail to start a new listener on 443
+4. fail loading the new Corefile, abort and keep using the old process
+
+After the aborted attempt to reload we are left with the old proceses running, but the listener is
+closed in step 1; so the health endpoint is broken. The same can hopen in the prometheus metrics plugin.
+
+In general be careful with assigning new port and expecting reload to work fully.
diff --git a/test/reload_test.go b/test/reload_test.go
index 24941c6d4..e49628614 100644
--- a/test/reload_test.go
+++ b/test/reload_test.go
@@ -1,6 +1,9 @@
package test
import (
+ "bytes"
+ "io/ioutil"
+ "net/http"
"testing"
"github.com/miekg/dns"
@@ -52,3 +55,71 @@ func send(t *testing.T, server string) {
t.Fatalf("Expected 2 RRs in additional, got %d", len(r.Extra))
}
}
+
+func TestReloadHealth(t *testing.T) {
+ corefile := `
+.:0 {
+ health 127.0.0.1:52182
+ whoami
+}`
+ c, err := CoreDNSServer(corefile)
+ if err != nil {
+ t.Fatalf("Could not get service instance: %s", err)
+ }
+
+ // This fails with address 8080 already in use, it shouldn't.
+ if c1, err := c.Restart(NewInput(corefile)); err != nil {
+ t.Fatal(err)
+ } else {
+ c1.Stop()
+ }
+}
+
+func TestReloadMetricsHealth(t *testing.T) {
+ corefile := `
+.:0 {
+ prometheus 127.0.0.1:53183
+ health 127.0.0.1:53184
+ whoami
+}`
+ c, err := CoreDNSServer(corefile)
+ if err != nil {
+ t.Fatalf("Could not get service instance: %s", err)
+ }
+
+ c1, err := c.Restart(NewInput(corefile))
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer c1.Stop()
+
+ // Send query to trigger monitoring to export on the new process
+ udp, _ := CoreDNSServerPorts(c1, 0)
+ m := new(dns.Msg)
+ m.SetQuestion("example.org.", dns.TypeA)
+ if _, err := dns.Exchange(m, udp); err != nil {
+ t.Fatal(err)
+ }
+
+ // Health
+ resp, err := http.Get("http://localhost:53184/health")
+ if err != nil {
+ t.Fatal(err)
+ }
+ ok, _ := ioutil.ReadAll(resp.Body)
+ resp.Body.Close()
+ if string(ok) != "OK" {
+ t.Errorf("Failed to receive OK, got %s", ok)
+ }
+
+ // Metrics
+ resp, err = http.Get("http://localhost:53183/metrics")
+ if err != nil {
+ t.Fatal(err)
+ }
+ const proc = "process_virtual_memory_bytes"
+ metrics, _ := ioutil.ReadAll(resp.Body)
+ if !bytes.Contains(metrics, []byte(proc)) {
+ t.Errorf("Failed to see %s in metric output", proc)
+ }
+}