aboutsummaryrefslogtreecommitdiff
path: root/plugin
diff options
context:
space:
mode:
Diffstat (limited to 'plugin')
-rw-r--r--plugin/health/README.md6
-rw-r--r--plugin/health/overloaded.go12
-rw-r--r--plugin/plugin.go3
3 files changed, 17 insertions, 4 deletions
diff --git a/plugin/health/README.md b/plugin/health/README.md
index ce2446a21..c8fda61a4 100644
--- a/plugin/health/README.md
+++ b/plugin/health/README.md
@@ -50,11 +50,13 @@ Doing this is supported but both endpoints ":8080" and ":8081" will export the e
If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported:
- * `coredns_health_request_duration_seconds{}` - duration to process a HTTP query to the local
+ * `coredns_health_request_duration_seconds{}` - duration to process a HTTP query to the local
`/health` endpoint. As this is a local operation it should be fast. A (large) increase in this
duration indicates the CoreDNS process is having trouble keeping up with its query load.
+ * `coredns_health_request_failures_total{}` - The number of times the internal health check loop
+ failed to query `/health`.
-Note that this metric *does not* have a `server` label, because being overloaded is a symptom of
+Note that these metrics *do not* have a `server` label, because being overloaded is a symptom of
the running process, *not* a specific server.
## Examples
diff --git a/plugin/health/overloaded.go b/plugin/health/overloaded.go
index d996827c0..3a4c5f08b 100644
--- a/plugin/health/overloaded.go
+++ b/plugin/health/overloaded.go
@@ -26,7 +26,8 @@ func (h *health) overloaded() {
start := time.Now()
resp, err := client.Get(url)
if err != nil {
- HealthDuration.Observe(timeout.Seconds())
+ HealthDuration.Observe(time.Since(start).Seconds())
+ HealthFailures.Inc()
log.Warningf("Local health request to %q failed: %s", url, err)
continue
}
@@ -49,7 +50,14 @@ var (
Namespace: plugin.Namespace,
Subsystem: "health",
Name: "request_duration_seconds",
- Buckets: plugin.TimeBuckets,
+ Buckets: plugin.SlimTimeBuckets,
Help: "Histogram of the time (in seconds) each request took.",
})
+ // HealthFailures is the metric used to count how many times the health request failed.
+ HealthFailures = promauto.NewCounter(prometheus.CounterOpts{
+ Namespace: plugin.Namespace,
+ Subsystem: "health",
+ Name: "request_failures_total",
+ Help: "The number of times the health check failed.",
+ })
)
diff --git a/plugin/plugin.go b/plugin/plugin.go
index 9bac48885..51f5ba79c 100644
--- a/plugin/plugin.go
+++ b/plugin/plugin.go
@@ -105,5 +105,8 @@ const Namespace = "coredns"
// TimeBuckets is based on Prometheus client_golang prometheus.DefBuckets
var TimeBuckets = prometheus.ExponentialBuckets(0.00025, 2, 16) // from 0.25ms to 8 seconds
+// SlimTimeBuckets is a low-cardinality set of duration buckets.
+var SlimTimeBuckets = prometheus.ExponentialBuckets(0.00025, 10, 5) // from 0.25ms to 2.5 seconds
+
// ErrOnce is returned when a plugin doesn't support multiple setups per server.
var ErrOnce = errors.New("this plugin can only be used once per Server Block")