tasks scheduling: metrics: enhance TasksWaiting metric.

The `TasksWaiting` metric is now labelled with the following attributes
of the tasks that are waiting to be executed:

 * Team ID

 * Worker tags: when multiple tags are defined they are joined with `_`

 * Platform

One of the most interesting uses of the metric is autoscaling worker
instances; the added labels allow for more targeted scaling because
they differentiate the groups the workers might belong to.

Also add a metric to track the amount of time tasks spend
waiting to be executed.

Signed-off-by: Alessandro Degano <alessandro.degano@pix4d.com>
Alessandro Degano 2020-07-29 14:40:55 +02:00
parent 6ab1954b70
commit e3e38f01cb
8 changed files with 130 additions and 25 deletions
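
For reference, here is a minimal sketch (not part of the commit) of how a caller records both metrics through the API introduced in this diff. The identifiers `metric.TasksWaitingLabels`, `metric.TasksWaiting` and `metric.TasksWaitingDuration` are the ones added below; the logger setup and the concrete label values are illustrative only:

```go
package main

import (
	"strconv"
	"strings"
	"time"

	"code.cloudfoundry.org/lager"
	"github.com/concourse/concourse/atc/metric"
)

func main() {
	logger := lager.NewLogger("example") // illustrative logger

	// Labels identifying the group of workers a waiting task can run on.
	// In the worker client these values come from the worker and container specs.
	labels := metric.TasksWaitingLabels{
		TeamId:     strconv.Itoa(42),                        // team ID rendered as a string label
		WorkerTags: strings.Join([]string{"tester", "linux"}, "_"), // multiple tags joined with "_"
		Platform:   "darwin",
	}

	// Gauge: one Gauge per label combination, created lazily on first use.
	if _, ok := metric.TasksWaiting[labels]; !ok {
		metric.TasksWaiting[labels] = &metric.Gauge{}
	}
	metric.TasksWaiting[labels].Inc()       // a task started waiting...
	defer metric.TasksWaiting[labels].Dec() // ...and stops waiting when done

	// Histogram: emitted once a worker has been found, with the elapsed wait time.
	metric.TasksWaitingDuration{
		Labels:   labels,
		Duration: 30 * time.Second, // illustrative elapsed time
	}.Emit(logger)
}
```

In the commit itself this bookkeeping is done by the worker client's `chooseTaskWorker`, and the periodic `tick` then emits one gauge value per label combination.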


@@ -27,7 +27,8 @@ type PrometheusEmitter struct {
 	concurrentRequestsLimitHit *prometheus.CounterVec
 	concurrentRequests *prometheus.GaugeVec
-	tasksWaiting prometheus.Gauge
+	tasksWaiting *prometheus.GaugeVec
+	tasksWaitingDuration *prometheus.HistogramVec
 	buildDurationsVec *prometheus.HistogramVec
 	buildsAborted prometheus.Counter
@@ -171,14 +172,23 @@ func (config *PrometheusConfig) NewEmitter() (metric.Emitter, error) {
 	}, []string{"action"})
 	prometheus.MustRegister(concurrentRequests)
-	tasksWaiting := prometheus.NewGauge(prometheus.GaugeOpts{
+	tasksWaiting := prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: "concourse",
 		Subsystem: "tasks",
 		Name: "waiting",
 		Help: "Number of Concourse tasks currently waiting.",
-	})
+	}, []string{"teamId", "workerTags", "platform"})
 	prometheus.MustRegister(tasksWaiting)
+	tasksWaitingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Namespace: "concourse",
+		Subsystem: "tasks",
+		Name: "wait_duration",
+		Help: "Elapsed time waiting for execution",
+		Buckets: []float64{1, 15, 30, 60, 120, 180, 240, 300, 600, 1200},
+	}, []string{"teamId", "workerTags", "platform"})
+	prometheus.MustRegister(tasksWaitingDuration)
 	buildsFinished := prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: "concourse",
 		Subsystem: "builds",
@@ -398,7 +408,8 @@ func (config *PrometheusConfig) NewEmitter() (metric.Emitter, error) {
 		concurrentRequestsLimitHit: concurrentRequestsLimitHit,
 		concurrentRequests: concurrentRequests,
-		tasksWaiting: tasksWaiting,
+		tasksWaiting: tasksWaiting,
+		tasksWaitingDuration: tasksWaitingDuration,
 		buildDurationsVec: buildDurationsVec,
 		buildsAborted: buildsAborted,
@@ -463,9 +474,22 @@ func (emitter *PrometheusEmitter) Emit(logger lager.Logger, event metric.Event)
 	case "concurrent requests limit hit":
 		emitter.concurrentRequestsLimitHit.WithLabelValues(event.Attributes["action"]).Add(event.Value)
 	case "concurrent requests":
-		emitter.concurrentRequests.WithLabelValues(event.Attributes["action"]).Set(event.Value)
+		emitter.concurrentRequests.
+			WithLabelValues(event.Attributes["action"]).Set(event.Value)
 	case "tasks waiting":
-		emitter.tasksWaiting.Set(event.Value)
+		emitter.tasksWaiting.
+			WithLabelValues(
+				event.Attributes["teamId"],
+				event.Attributes["workerTags"],
+				event.Attributes["platform"],
+			).Set(event.Value)
+	case "tasks waiting duration":
+		emitter.tasksWaitingDuration.
+			WithLabelValues(
+				event.Attributes["teamId"],
+				event.Attributes["workerTags"],
+				event.Attributes["platform"],
+			).Observe(event.Value)
 	case "build finished":
 		emitter.buildFinishedMetrics(logger, event)
 	case "worker containers":


@@ -187,7 +187,6 @@ var _ = Describe("PrometheusEmitter", func() {
 			}
 		})
 		JustBeforeEach(func() {
 			prometheusEmitter, err = prometheusConfig.NewEmitter()
 		})
@@ -196,6 +195,11 @@
 			prometheusEmitter.Emit(logger, metric.Event{
 				Name: "tasks waiting",
 				Value: 4,
+				Attributes: map[string]string{
+					"teamId": "42",
+					"workerTags": "tester",
+					"platform": "darwin",
+				},
 			})
 			res, _ := http.Get(fmt.Sprintf("http://%s:%s/metrics", prometheusConfig.BindIP, prometheusConfig.BindPort))
@@ -203,7 +207,7 @@
 			body, _ := ioutil.ReadAll(res.Body)
 			Expect(res.StatusCode).To(Equal(http.StatusOK))
-			Expect(string(body)).To(ContainSubstring("concourse_tasks_waiting 4"))
+			Expect(string(body)).To(ContainSubstring("concourse_tasks_waiting{platform=\"darwin\",teamId=\"42\",workerTags=\"tester\"} 4"))
 			Expect(err).To(BeNil())
 		})
 	})


@@ -30,8 +30,6 @@ var JobsScheduling = &Gauge{}
 var BuildsStarted = &Counter{}
 var BuildsRunning = &Gauge{}
-var TasksWaiting = &Gauge{}
 var ChecksFinishedWithError = &Counter{}
 var ChecksFinishedWithSuccess = &Counter{}
 var ChecksQueueSize = &Gauge{}
@@ -41,6 +39,34 @@ var ChecksEnqueued = &Counter{}
 var ConcurrentRequests = map[string]*Gauge{}
 var ConcurrentRequestsLimitHit = map[string]*Counter{}
+type TasksWaitingLabels struct {
+	TeamId string
+	WorkerTags string
+	Platform string
+}
+var TasksWaiting = map[TasksWaitingLabels]*Gauge{}
+type TasksWaitingDuration struct {
+	Labels TasksWaitingLabels
+	Duration time.Duration
+}
+func (event TasksWaitingDuration) Emit(logger lager.Logger) {
+	emit(
+		logger.Session("tasks-waiting-duration"),
+		Event{
+			Name: "tasks waiting duration",
+			Value: event.Duration.Seconds(),
+			Attributes: map[string]string{
+				"teamId": event.Labels.TeamId,
+				"workerTags": event.Labels.WorkerTags,
+				"platform": event.Labels.Platform,
+			},
+		},
+	)
+}
 type BuildCollectorDuration struct {
 	Duration time.Duration
 }


@@ -165,13 +165,20 @@ func tick(logger lager.Logger) {
 		)
 	}
-	emit(
-		logger.Session("tasks-waiting"),
-		Event{
-			Name: "tasks waiting",
-			Value: TasksWaiting.Max(),
-		},
-	)
+	for labels, gauge := range TasksWaiting {
+		emit(
+			logger.Session("tasks-waiting"),
+			Event{
+				Name: "tasks waiting",
+				Value: gauge.Max(),
+				Attributes: map[string]string{
+					"teamId": labels.TeamId,
+					"workerTags": labels.WorkerTags,
+					"platform": labels.Platform,
+				},
+			},
+		)
+	}
 	emit(
 		logger.Session("checks-finished-with-error"),


@@ -142,10 +142,17 @@ var _ = Describe("Periodic emission of metrics", func() {
 	})
 	Context("limit-active-tasks metrics", func() {
+		labels := metric.TasksWaitingLabels{
+			TeamId: "42",
+			WorkerTags: "tester",
+			Platform: "darwin",
+		}
 		BeforeEach(func() {
 			gauge := &metric.Gauge{}
 			gauge.Set(123)
-			metric.TasksWaiting = gauge
+			metric.TasksWaiting[labels] = gauge
 		})
 		It("emits", func() {
 			Eventually(emitter.EmitCallCount).Should(BeNumerically(">=", 1))
@@ -155,6 +162,11 @@
 				MatchFields(IgnoreExtras, Fields{
 					"Name": Equal("tasks waiting"),
 					"Value": Equal(float64(123)),
+					"Attributes": Equal(map[string]string{
+						"teamId": labels.TeamId,
+						"workerTags": labels.WorkerTags,
+						"platform": labels.Platform,
+					}),
 				}),
 			),
 		),


@@ -2,11 +2,14 @@ package worker_test
 import (
 	"bytes"
-	"code.cloudfoundry.org/garden"
 	"context"
-	"github.com/concourse/concourse/atc/metric"
+	"strconv"
+	"strings"
 	"time"
+	"code.cloudfoundry.org/garden"
+	"github.com/concourse/concourse/atc/metric"
 	"code.cloudfoundry.org/lager/lagertest"
 	"github.com/concourse/concourse/atc"
@@ -166,8 +169,15 @@ var _ = Describe("RunTaskStep", func() {
 		})
 		It("task waiting metrics is gauged", func() {
-			Eventually(metric.TasksWaiting.Max(), 2*time.Second).Should(Equal(float64(1)))
-			Eventually(metric.TasksWaiting.Max(), 2*time.Second).Should(Equal(float64(0)))
+			labels := metric.TasksWaitingLabels{
+				TeamId: strconv.Itoa(fakeWorkerSpec.TeamID),
+				WorkerTags: strings.Join(fakeContainerSpec.Tags, "_"),
+				Platform: fakeWorkerSpec.Platform,
+			}
+			// Verify that when one task is waiting the gauge is increased...
+			Eventually(metric.TasksWaiting[labels].Max(), 2*time.Second).Should(Equal(float64(1)))
+			// and then decreased.
+			Eventually(metric.TasksWaiting[labels].Max(), 2*time.Second).Should(Equal(float64(0)))
 		})
 		It("writes status to output writer", func() {


@@ -7,6 +7,7 @@ import (
 	"io"
 	"path"
 	"strconv"
+	"strings"
 	"time"
 	"code.cloudfoundry.org/garden"
@@ -586,6 +587,12 @@ func (client *client) chooseTaskWorker(
 	workerStatusPublishTicker := time.NewTicker(client.workerStatusPublishInterval)
 	defer workerStatusPublishTicker.Stop()
+	tasksWaitingLabels := metric.TasksWaitingLabels{
+		TeamId: strconv.Itoa(workerSpec.TeamID),
+		WorkerTags: strings.Join(containerSpec.Tags, "_"),
+		Platform: workerSpec.Platform,
+	}
 	for {
 		if chosenWorker, err = client.pool.FindOrChooseWorkerForContainer(
 			ctx,
@@ -631,6 +638,10 @@ func (client *client) chooseTaskWorker(
 		if elapsed > 0 {
 			message := fmt.Sprintf("Found a free worker after waiting %s.\n", elapsed.Round(1*time.Second))
 			writeOutputMessage(logger, outputWriter, message)
+			metric.TasksWaitingDuration{
+				Labels: tasksWaitingLabels,
+				Duration: elapsed,
+			}.Emit(logger)
 		}
 		return chosenWorker, err
@@ -643,8 +654,12 @@ func (client *client) chooseTaskWorker(
 		// Increase task waiting only once
 		if elapsed == 0 {
-			metric.TasksWaiting.Inc()
-			defer metric.TasksWaiting.Dec()
+			_, ok := metric.TasksWaiting[tasksWaitingLabels]
+			if !ok {
+				metric.TasksWaiting[tasksWaitingLabels] = &metric.Gauge{}
+			}
+			metric.TasksWaiting[tasksWaitingLabels].Inc()
+			defer metric.TasksWaiting[tasksWaitingLabels].Dec()
 		}
 		elapsed = waitForWorker(logger,


@@ -25,4 +25,11 @@
 #### <sub><sup><a name="5146" href="#5146">:link:</a></sup></sub> feature
-* Refactor TSA to use Concourse's gclient which has a configurable timeout Issue: #5146 PR: #5845
+* Refactor TSA to use Concourse's gclient which has a configurable timeout Issue: #5146 PR: #5845
+#### <sub><sup><a name="5981" href="#5981">:link:</a></sup></sub> feature
+* Enhance the `tasks_waiting` metric to export Prometheus labels for the platform, worker tags, and team of the tasks awaiting execution.
+  A new `tasks_wait_duration_bucket` metric is also added: a histogram of the time tasks spend awaiting execution, from which quantiles can be derived. PR: #5981
+  ![Example graph for the task wait time histograms.](https://user-images.githubusercontent.com/40891147/89990749-189d2600-dc83-11ea-8fde-ae579fdb0a0a.png)
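
For illustration only (not part of the commit), a scrape of the Prometheus endpoint would now include series along these lines. The `concourse_tasks_waiting` line mirrors the expectation in the emitter test above; the histogram values and the subset of buckets shown here are invented:

```
concourse_tasks_waiting{platform="darwin",teamId="42",workerTags="tester"} 4
concourse_tasks_wait_duration_bucket{platform="darwin",teamId="42",workerTags="tester",le="30"} 1
concourse_tasks_wait_duration_bucket{platform="darwin",teamId="42",workerTags="tester",le="60"} 3
concourse_tasks_wait_duration_bucket{platform="darwin",teamId="42",workerTags="tester",le="+Inf"} 3
concourse_tasks_wait_duration_sum{platform="darwin",teamId="42",workerTags="tester"} 95
concourse_tasks_wait_duration_count{platform="darwin",teamId="42",workerTags="tester"} 3
```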