diff options
author | 2018-01-10 11:41:22 +0000 | |
---|---|---|
committer | 2018-01-10 11:41:22 +0000 | |
commit | 48059a6c3e33f34263c74a09630a85ae885e1178 (patch) | |
tree | 8cf9103a034df828ee3d7ee64c7b720e41e2de27 /plugin | |
parent | cced1a4c12b3a6646e186d14ce89bf4f60a632f8 (diff) | |
download | coredns-48059a6c3e33f34263c74a09630a85ae885e1178.tar.gz coredns-48059a6c3e33f34263c74a09630a85ae885e1178.tar.zst coredns-48059a6c3e33f34263c74a09630a85ae885e1178.zip |
Overloaded (#1364)
* plugin/health: add 'overloaded metrics'
Query our own health endpoint and record (and export as a metric) the
time it takes. The Get has a 5s timeout that, when reached, will set
the metric duration to 5s. The actual "am I overloaded" call is left
to an external entity.
* README
* golint and govet
* and the tests
Diffstat (limited to 'plugin')
-rw-r--r-- | plugin/health/README.md | 8 | ||||
-rw-r--r-- | plugin/health/health.go | 12 | ||||
-rw-r--r-- | plugin/health/health_test.go | 4 | ||||
-rw-r--r-- | plugin/health/overloaded.go | 52 | ||||
-rw-r--r-- | plugin/health/setup.go | 21 |
5 files changed, 90 insertions, 7 deletions
diff --git a/plugin/health/README.md b/plugin/health/README.md index f423e7088..417ad167e 100644 --- a/plugin/health/README.md +++ b/plugin/health/README.md @@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README. Any plugin that implements the Healther interface will be used to report health. +## Metrics + +If monitoring is enabled (via the *prometheus* directive) then the following metric is exported: + +* `coredns_health_request_duration_seconds{}` - duration to process a /health query. As this should + be a local operation it should be fast. A (large) increases in this duration indicates the + CoreDNS process is having trouble keeping up. + ## Examples Run another health endpoint on http://localhost:8091. diff --git a/plugin/health/health.go b/plugin/health/health.go index 0a256c963..c66f40b00 100644 --- a/plugin/health/health.go +++ b/plugin/health/health.go @@ -21,9 +21,11 @@ type health struct { h []Healther sync.RWMutex ok bool // ok is the global boolean indicating an all healthy plugin stack + + stop chan bool } -func (h *health) Startup() error { +func (h *health) OnStartup() error { if h.Addr == "" { h.Addr = defAddr } @@ -51,14 +53,20 @@ func (h *health) Startup() error { go func() { http.Serve(h.ln, h.mux) }() + go func() { + h.overloaded() + }() }) return nil } -func (h *health) Shutdown() error { +func (h *health) OnShutdown() error { if h.ln != nil { return h.ln.Close() } + + h.stop <- true + return nil } diff --git a/plugin/health/health_test.go b/plugin/health/health_test.go index 276997329..0bfc50f2f 100644 --- a/plugin/health/health_test.go +++ b/plugin/health/health_test.go @@ -13,10 +13,10 @@ func TestHealth(t *testing.T) { h := health{Addr: ":0"} h.h = append(h.h, &erratic.Erratic{}) - if err := h.Startup(); err != nil { + if err := h.OnStartup(); err != nil { t.Fatalf("Unable to startup the health server: %v", err) } - defer h.Shutdown() + defer h.OnShutdown() // Reconstruct the http address based on the port 
allocated by operating system. address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path) diff --git a/plugin/health/overloaded.go b/plugin/health/overloaded.go new file mode 100644 index 000000000..976cafa84 --- /dev/null +++ b/plugin/health/overloaded.go @@ -0,0 +1,52 @@ +package health + +import ( + "net/http" + "sync" + "time" + + "github.com/coredns/coredns/plugin" + + "github.com/prometheus/client_golang/prometheus" +) + +// overloaded queries the health end point and updates a metrics showing how long it took. +func (h *health) overloaded() { + timeout := time.Duration(5 * time.Second) + client := http.Client{ + Timeout: timeout, + } + url := "http://" + h.Addr + tick := time.NewTicker(1 * time.Second) + + for { + select { + case <-tick.C: + start := time.Now() + resp, err := client.Get(url) + if err != nil { + HealthDuration.Observe(timeout.Seconds()) + continue + } + resp.Body.Close() + HealthDuration.Observe(time.Since(start).Seconds()) + + case <-h.stop: + tick.Stop() + return + } + } +} + +var ( + // HealthDuration is the metric used for exporting how fast we can retrieve the /health endpoint. 
+ HealthDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: plugin.Namespace, + Subsystem: "health", + Name: "request_duration_seconds", + Buckets: plugin.TimeBuckets, + Help: "Histogram of the time (in seconds) each request took.", + }) +) + +var onceMetric sync.Once diff --git a/plugin/health/setup.go b/plugin/health/setup.go index 12dd29c84..d5285c55f 100644 --- a/plugin/health/setup.go +++ b/plugin/health/setup.go @@ -6,6 +6,7 @@ import ( "github.com/coredns/coredns/core/dnsserver" "github.com/coredns/coredns/plugin" + "github.com/coredns/coredns/plugin/metrics" "github.com/mholt/caddy" ) @@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error { return plugin.Error("health", err) } - h := &health{Addr: addr} + h := &health{Addr: addr, stop: make(chan bool)} c.OnStartup(func() error { plugins := dnsserver.GetConfig(c).Handlers() @@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error { }) c.OnStartup(func() error { + // Poll all middleware every second. h.poll() go func() { for { @@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error { return nil }) - c.OnStartup(h.Startup) - c.OnFinalShutdown(h.Shutdown) + c.OnStartup(func() error { + onceMetric.Do(func() { + m := dnsserver.GetConfig(c).Handler("prometheus") + if m == nil { + return + } + if x, ok := m.(*metrics.Metrics); ok { + x.MustRegister(HealthDuration) + } + }) + return nil + }) + + c.OnStartup(h.OnStartup) + c.OnFinalShutdown(h.OnShutdown) // Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running. return nil |