aboutsummaryrefslogtreecommitdiff
path: root/plugin/health/overloaded.go
blob: f8b3256bf60fa9b998abe4d784fdadc02130723a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
package health

import (
	"context"
	"net"
	"net/http"
	"time"

	"github.com/coredns/coredns/plugin"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// overloaded polls the health endpoint at h.healthURI once per second and records how
// long each request took in the HealthDuration histogram. Requests that fail are also
// counted in HealthFailures and logged. The function blocks until ctx is done, so it is
// meant to be run in its own goroutine.
func (h *health) overloaded(ctx context.Context) {
	// Dedicated transport with Proxy explicitly set to nil so the probe is never
	// routed through a proxy.
	bypassProxy := &http.Transport{
		Proxy: nil,
		DialContext: (&net.Dialer{
			Timeout:   30 * time.Second,
			KeepAlive: 30 * time.Second,
		}).DialContext,
		ForceAttemptHTTP2:     true,
		MaxIdleConns:          100,
		IdleConnTimeout:       90 * time.Second,
		TLSHandshakeTimeout:   10 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
	}
	// The transport lives only as long as this call; release any pooled connections
	// when we return instead of leaving them to the idle timeout.
	defer bypassProxy.CloseIdleConnections()

	timeout := 3 * time.Second
	client := http.Client{
		Timeout:   timeout,
		Transport: bypassProxy,
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, h.healthURI.String(), nil)
	if err != nil {
		// Without a request there is nothing to probe, and a nil request must never
		// reach client.Do.
		log.Warningf("Failed to create local health request to %q: %s", h.healthURI.String(), err)
		return
	}

	tick := time.NewTicker(1 * time.Second)
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			start := time.Now()
			resp, err := client.Do(req)
			if err != nil {
				if ctx.Err() != nil {
					// The request was aborted because the parent context ended;
					// this is shutdown, not a health failure.
					return
				}
				HealthDuration.Observe(time.Since(start).Seconds())
				HealthFailures.Inc()
				log.Warningf("Local health request to %q failed: %s", req.URL.String(), err)
				continue
			}
			// Only the round-trip time matters here; the response itself is not inspected.
			resp.Body.Close()
			elapsed := time.Since(start)
			HealthDuration.Observe(elapsed.Seconds())
			if elapsed > time.Second { // 1s is pretty random, but a *local* scrape taking that long isn't good
				log.Warningf("Local health request to %q took more than 1s: %s", req.URL.String(), elapsed)
			}

		case <-ctx.Done():
			return
		}
	}
}

// HealthDuration is the histogram exporting how long, in seconds, it took to retrieve
// the /health endpoint.
var HealthDuration = promauto.NewHistogram(prometheus.HistogramOpts{
	Namespace:                   plugin.Namespace,
	Subsystem:                   "health",
	Name:                        "request_duration_seconds",
	Help:                        "Histogram of the time (in seconds) each request took.",
	Buckets:                     plugin.SlimTimeBuckets,
	NativeHistogramBucketFactor: plugin.NativeHistogramBucketFactor,
})

// HealthFailures is the counter exporting how many times the health request failed.
var HealthFailures = promauto.NewCounter(prometheus.CounterOpts{
	Namespace: plugin.Namespace,
	Subsystem: "health",
	Name:      "request_failures_total",
	Help:      "The number of times the health check failed.",
})