aboutsummaryrefslogtreecommitdiff
path: root/plugin
diff options
context:
space:
mode:
authorGravatar Miek Gieben <miek@miek.nl> 2018-01-10 11:41:22 +0000
committerGravatar GitHub <noreply@github.com> 2018-01-10 11:41:22 +0000
commit48059a6c3e33f34263c74a09630a85ae885e1178 (patch)
tree8cf9103a034df828ee3d7ee64c7b720e41e2de27 /plugin
parentcced1a4c12b3a6646e186d14ce89bf4f60a632f8 (diff)
downloadcoredns-48059a6c3e33f34263c74a09630a85ae885e1178.tar.gz
coredns-48059a6c3e33f34263c74a09630a85ae885e1178.tar.zst
coredns-48059a6c3e33f34263c74a09630a85ae885e1178.zip
Overloaded (#1364)
* plugin/health: add 'overloaded' metrics. Query our own health endpoint and record (and export as a metric) the time it takes. The Get has a 5s timeout that, when reached, will set the metric duration to 5s. The actual call on "Am I overloaded?" is left to an external entity. * README * golint and govet * and the tests
Diffstat (limited to 'plugin')
-rw-r--r--plugin/health/README.md8
-rw-r--r--plugin/health/health.go12
-rw-r--r--plugin/health/health_test.go4
-rw-r--r--plugin/health/overloaded.go52
-rw-r--r--plugin/health/setup.go21
5 files changed, 90 insertions, 7 deletions
diff --git a/plugin/health/README.md b/plugin/health/README.md
index f423e7088..417ad167e 100644
--- a/plugin/health/README.md
+++ b/plugin/health/README.md
@@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README.
Any plugin that implements the Healther interface will be used to report health.
+## Metrics
+
+If monitoring is enabled (via the *prometheus* directive) then the following metric is exported:
+
+* `coredns_health_request_duration_seconds{}` - duration to process a /health query. As this should
+ be a local operation it should be fast. A (large) increase in this duration indicates the
+ CoreDNS process is having trouble keeping up.
+
## Examples
Run another health endpoint on http://localhost:8091.
diff --git a/plugin/health/health.go b/plugin/health/health.go
index 0a256c963..c66f40b00 100644
--- a/plugin/health/health.go
+++ b/plugin/health/health.go
@@ -21,9 +21,11 @@ type health struct {
h []Healther
sync.RWMutex
ok bool // ok is the global boolean indicating an all healthy plugin stack
+
+ stop chan bool
}
-func (h *health) Startup() error {
+func (h *health) OnStartup() error {
if h.Addr == "" {
h.Addr = defAddr
}
@@ -51,14 +53,20 @@ func (h *health) Startup() error {
go func() {
http.Serve(h.ln, h.mux)
}()
+ go func() {
+ h.overloaded()
+ }()
})
return nil
}
-func (h *health) Shutdown() error {
+func (h *health) OnShutdown() error {
if h.ln != nil {
return h.ln.Close()
}
+
+ h.stop <- true
+
return nil
}
diff --git a/plugin/health/health_test.go b/plugin/health/health_test.go
index 276997329..0bfc50f2f 100644
--- a/plugin/health/health_test.go
+++ b/plugin/health/health_test.go
@@ -13,10 +13,10 @@ func TestHealth(t *testing.T) {
h := health{Addr: ":0"}
h.h = append(h.h, &erratic.Erratic{})
- if err := h.Startup(); err != nil {
+ if err := h.OnStartup(); err != nil {
t.Fatalf("Unable to startup the health server: %v", err)
}
- defer h.Shutdown()
+ defer h.OnShutdown()
// Reconstruct the http address based on the port allocated by operating system.
address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path)
diff --git a/plugin/health/overloaded.go b/plugin/health/overloaded.go
new file mode 100644
index 000000000..976cafa84
--- /dev/null
+++ b/plugin/health/overloaded.go
@@ -0,0 +1,52 @@
+package health
+
+import (
+ "net/http"
+ "sync"
+ "time"
+
+ "github.com/coredns/coredns/plugin"
+
+ "github.com/prometheus/client_golang/prometheus"
+)
+
+// overloaded queries the health endpoint and updates a metric showing how long it took.
+func (h *health) overloaded() {
+ timeout := time.Duration(5 * time.Second)
+ client := http.Client{
+ Timeout: timeout,
+ }
+ url := "http://" + h.Addr
+ tick := time.NewTicker(1 * time.Second)
+
+ for {
+ select {
+ case <-tick.C:
+ start := time.Now()
+ resp, err := client.Get(url)
+ if err != nil {
+ HealthDuration.Observe(timeout.Seconds())
+ continue
+ }
+ resp.Body.Close()
+ HealthDuration.Observe(time.Since(start).Seconds())
+
+ case <-h.stop:
+ tick.Stop()
+ return
+ }
+ }
+}
+
+var (
+ // HealthDuration is the metric used for exporting how fast we can retrieve the /health endpoint.
+ HealthDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
+ Namespace: plugin.Namespace,
+ Subsystem: "health",
+ Name: "request_duration_seconds",
+ Buckets: plugin.TimeBuckets,
+ Help: "Histogram of the time (in seconds) each request took.",
+ })
+)
+
+var onceMetric sync.Once
diff --git a/plugin/health/setup.go b/plugin/health/setup.go
index 12dd29c84..d5285c55f 100644
--- a/plugin/health/setup.go
+++ b/plugin/health/setup.go
@@ -6,6 +6,7 @@ import (
"github.com/coredns/coredns/core/dnsserver"
"github.com/coredns/coredns/plugin"
+ "github.com/coredns/coredns/plugin/metrics"
"github.com/mholt/caddy"
)
@@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error {
return plugin.Error("health", err)
}
- h := &health{Addr: addr}
+ h := &health{Addr: addr, stop: make(chan bool)}
c.OnStartup(func() error {
plugins := dnsserver.GetConfig(c).Handlers()
@@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error {
})
c.OnStartup(func() error {
+ // Poll all middleware every second.
h.poll()
go func() {
for {
@@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error {
return nil
})
- c.OnStartup(h.Startup)
- c.OnFinalShutdown(h.Shutdown)
+ c.OnStartup(func() error {
+ onceMetric.Do(func() {
+ m := dnsserver.GetConfig(c).Handler("prometheus")
+ if m == nil {
+ return
+ }
+ if x, ok := m.(*metrics.Metrics); ok {
+ x.MustRegister(HealthDuration)
+ }
+ })
+ return nil
+ })
+
+ c.OnStartup(h.OnStartup)
+ c.OnFinalShutdown(h.OnShutdown)
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
return nil