Skip to content

Commit 6424093

Browse files
authored
feat: add prebuilds reconciliation duration metric (coder#20535) (coder#20581)
Related to PR: coder#20535 (cherry picked from commit aad1b40)
1 parent 2cf4b5c commit 6424093

File tree

5 files changed

+125
-31
lines changed

5 files changed

+125
-31
lines changed

coderd/prebuilds/api.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,18 @@ type ReconciliationOrchestrator interface {
3737
TrackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement)
3838
}
3939

40+
// ReconcileStats contains statistics about a reconciliation cycle.
41+
type ReconcileStats struct {
42+
Elapsed time.Duration
43+
}
44+
4045
type Reconciler interface {
4146
StateSnapshotter
4247

4348
// ReconcileAll orchestrates the reconciliation of all prebuilds across all templates.
4449
// It takes a global snapshot of the system state and then reconciles each preset
4550
// in parallel, creating or deleting prebuilds as needed to reach their desired states.
46-
ReconcileAll(ctx context.Context) error
51+
ReconcileAll(ctx context.Context) (ReconcileStats, error)
4752
}
4853

4954
// StateSnapshotter defines the operations necessary to capture workspace prebuilds state.

coderd/prebuilds/noop.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@ func (NoopReconciler) Run(context.Context) {}
1717
func (NoopReconciler) Stop(context.Context, error) {}
1818
func (NoopReconciler) TrackResourceReplacement(context.Context, uuid.UUID, uuid.UUID, []*sdkproto.ResourceReplacement) {
1919
}
20-
func (NoopReconciler) ReconcileAll(context.Context) error { return nil }
20+
21+
func (NoopReconciler) ReconcileAll(context.Context) (ReconcileStats, error) {
22+
return ReconcileStats{}, nil
23+
}
24+
2125
func (NoopReconciler) SnapshotState(context.Context, database.Store) (*GlobalSnapshot, error) {
2226
return &GlobalSnapshot{}, nil
2327
}

enterprise/coderd/prebuilds/metricscollector_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) {
485485
require.NoError(t, err)
486486

487487
// Run reconciliation to update the metric
488-
err = reconciler.ReconcileAll(ctx)
488+
_, err = reconciler.ReconcileAll(ctx)
489489
require.NoError(t, err)
490490

491491
// Check that the metric shows reconciliation is not paused
@@ -514,7 +514,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) {
514514
require.NoError(t, err)
515515

516516
// Run reconciliation to update the metric
517-
err = reconciler.ReconcileAll(ctx)
517+
_, err = reconciler.ReconcileAll(ctx)
518518
require.NoError(t, err)
519519

520520
// Check that the metric shows reconciliation is paused
@@ -543,7 +543,7 @@ func TestMetricsCollector_ReconciliationPausedMetric(t *testing.T) {
543543
require.NoError(t, err)
544544

545545
// Run reconciliation to update the metric
546-
err = reconciler.ReconcileAll(ctx)
546+
_, err = reconciler.ReconcileAll(ctx)
547547
require.NoError(t, err)
548548

549549
// Check that the metric shows reconciliation is not paused

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/google/uuid"
1616
"github.com/hashicorp/go-multierror"
1717
"github.com/prometheus/client_golang/prometheus"
18+
"github.com/prometheus/client_golang/prometheus/promauto"
1819
"golang.org/x/sync/errgroup"
1920
"golang.org/x/xerrors"
2021

@@ -44,7 +45,6 @@ type StoreReconciler struct {
4445
logger slog.Logger
4546
clock quartz.Clock
4647
registerer prometheus.Registerer
47-
metrics *MetricsCollector
4848
notifEnq notifications.Enqueuer
4949
buildUsageChecker *atomic.Pointer[wsbuilder.UsageChecker]
5050

@@ -53,6 +53,11 @@ type StoreReconciler struct {
5353
stopped atomic.Bool
5454
done chan struct{}
5555
provisionNotifyCh chan database.ProvisionerJob
56+
57+
// Prebuild state metrics
58+
metrics *MetricsCollector
59+
// Operational metrics
60+
reconciliationDuration prometheus.Histogram
5661
}
5762

5863
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
@@ -105,6 +110,15 @@ func NewStoreReconciler(store database.Store,
105110
// If the registerer fails to register the metrics collector, it's not fatal.
106111
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
107112
}
113+
114+
factory := promauto.With(registerer)
115+
reconciler.reconciliationDuration = factory.NewHistogram(prometheus.HistogramOpts{
116+
Namespace: "coderd",
117+
Subsystem: "prebuilds",
118+
Name: "reconciliation_duration_seconds",
119+
Help: "Duration of each prebuilds reconciliation cycle.",
120+
Buckets: prometheus.DefBuckets,
121+
})
108122
}
109123

110124
return reconciler
@@ -176,10 +190,15 @@ func (c *StoreReconciler) Run(ctx context.Context) {
176190
// instead of waiting for the next reconciliation interval
177191
case <-ticker.C:
178192
// Trigger a new iteration on each tick.
179-
err := c.ReconcileAll(ctx)
193+
stats, err := c.ReconcileAll(ctx)
180194
if err != nil {
181195
c.logger.Error(context.Background(), "reconciliation failed", slog.Error(err))
182196
}
197+
198+
if c.reconciliationDuration != nil {
199+
c.reconciliationDuration.Observe(stats.Elapsed.Seconds())
200+
}
201+
c.logger.Debug(ctx, "reconciliation stats", slog.F("elapsed", stats.Elapsed))
183202
case <-ctx.Done():
184203
// nolint:gocritic // it's okay to use slog.F() for an error in this case
185204
// because we want to differentiate two different types of errors: ctx.Err() and context.Cause()
@@ -263,19 +282,24 @@ func (c *StoreReconciler) Stop(ctx context.Context, cause error) {
263282
// be reconciled again, leading to another workspace being provisioned. Two workspace builds will be occurring
264283
// simultaneously for the same preset, but once both jobs have completed the reconciliation loop will notice the
265284
// extraneous instance and delete it.
266-
func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
285+
func (c *StoreReconciler) ReconcileAll(ctx context.Context) (stats prebuilds.ReconcileStats, err error) {
286+
start := c.clock.Now()
287+
defer func() {
288+
stats.Elapsed = c.clock.Since(start)
289+
}()
290+
267291
logger := c.logger.With(slog.F("reconcile_context", "all"))
268292

269293
select {
270294
case <-ctx.Done():
271295
logger.Warn(context.Background(), "reconcile exiting prematurely; context done", slog.Error(ctx.Err()))
272-
return nil
296+
return stats, nil
273297
default:
274298
}
275299

276300
logger.Debug(ctx, "starting reconciliation")
277301

278-
err := c.WithReconciliationLock(ctx, logger, func(ctx context.Context, _ database.Store) error {
302+
err = c.WithReconciliationLock(ctx, logger, func(ctx context.Context, _ database.Store) error {
279303
// Check if prebuilds reconciliation is paused
280304
settingsJSON, err := c.store.GetPrebuildsSettings(ctx)
281305
if err != nil {
@@ -348,7 +372,7 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error {
348372
logger.Error(ctx, "failed to reconcile", slog.Error(err))
349373
}
350374

351-
return err
375+
return stats, err
352376
}
353377

354378
func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) {

enterprise/coderd/prebuilds/reconcile_test.go

Lines changed: 81 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) {
7272
require.Equal(t, templateVersion, gotTemplateVersion)
7373

7474
// when we trigger the reconciliation loop for all templates
75-
require.NoError(t, controller.ReconcileAll(ctx))
75+
_, err = controller.ReconcileAll(ctx)
76+
require.NoError(t, err)
7677

7778
// then no reconciliation actions are taken
7879
// because without presets, there are no prebuilds
@@ -126,7 +127,8 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) {
126127
require.NotEmpty(t, presetParameters)
127128

128129
// when we trigger the reconciliation loop for all templates
129-
require.NoError(t, controller.ReconcileAll(ctx))
130+
_, err = controller.ReconcileAll(ctx)
131+
require.NoError(t, err)
130132

131133
// then no reconciliation actions are taken
132134
// because without prebuilds, there is nothing to reconcile
@@ -428,7 +430,8 @@ func (tc testCase) run(t *testing.T) {
428430
// Run the reconciliation multiple times to ensure idempotency
429431
// 8 was arbitrary, but large enough to reasonably trust the result
430432
for i := 1; i <= 8; i++ {
431-
require.NoErrorf(t, controller.ReconcileAll(ctx), "failed on iteration %d", i)
433+
_, err := controller.ReconcileAll(ctx)
434+
require.NoErrorf(t, err, "failed on iteration %d", i)
432435

433436
if tc.shouldCreateNewPrebuild != nil {
434437
newPrebuildCount := 0
@@ -542,7 +545,8 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
542545
// Run the reconciliation multiple times to ensure idempotency
543546
// 8 was arbitrary, but large enough to reasonably trust the result
544547
for i := 1; i <= 8; i++ {
545-
require.NoErrorf(t, controller.ReconcileAll(ctx), "failed on iteration %d", i)
548+
_, err := controller.ReconcileAll(ctx)
549+
require.NoErrorf(t, err, "failed on iteration %d", i)
546550

547551
newPrebuildCount := 0
548552
workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
@@ -668,7 +672,7 @@ func TestPrebuildScheduling(t *testing.T) {
668672
DesiredInstances: 5,
669673
})
670674

671-
err := controller.ReconcileAll(ctx)
675+
_, err := controller.ReconcileAll(ctx)
672676
require.NoError(t, err)
673677

674678
// get workspace builds
@@ -751,7 +755,8 @@ func TestInvalidPreset(t *testing.T) {
751755
// Run the reconciliation multiple times to ensure idempotency
752756
// 8 was arbitrary, but large enough to reasonably trust the result
753757
for i := 1; i <= 8; i++ {
754-
require.NoErrorf(t, controller.ReconcileAll(ctx), "failed on iteration %d", i)
758+
_, err := controller.ReconcileAll(ctx)
759+
require.NoErrorf(t, err, "failed on iteration %d", i)
755760

756761
workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
757762
require.NoError(t, err)
@@ -817,7 +822,8 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) {
817822
})
818823

819824
// Old prebuilt workspace should be deleted.
820-
require.NoError(t, controller.ReconcileAll(ctx))
825+
_, err = controller.ReconcileAll(ctx)
826+
require.NoError(t, err)
821827

822828
builds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{
823829
WorkspaceID: prebuiltWorkspace.ID,
@@ -916,12 +922,15 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
916922

917923
// Trigger reconciliation to attempt creating a new prebuild.
918924
// The outcome depends on whether the hard limit has been reached.
919-
require.NoError(t, controller.ReconcileAll(ctx))
925+
_, err = controller.ReconcileAll(ctx)
926+
require.NoError(t, err)
920927

921928
// These two additional calls to ReconcileAll should not trigger any notifications.
922929
// A notification is only sent once.
923-
require.NoError(t, controller.ReconcileAll(ctx))
924-
require.NoError(t, controller.ReconcileAll(ctx))
930+
_, err = controller.ReconcileAll(ctx)
931+
require.NoError(t, err)
932+
_, err = controller.ReconcileAll(ctx)
933+
require.NoError(t, err)
925934

926935
// Verify the final state after reconciliation.
927936
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
@@ -1093,12 +1102,15 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
10931102

10941103
// Trigger reconciliation to attempt creating a new prebuild.
10951104
// The outcome depends on whether the hard limit has been reached.
1096-
require.NoError(t, controller.ReconcileAll(ctx))
1105+
_, err = controller.ReconcileAll(ctx)
1106+
require.NoError(t, err)
10971107

10981108
// These two additional calls to ReconcileAll should not trigger any notifications.
10991109
// A notification is only sent once.
1100-
require.NoError(t, controller.ReconcileAll(ctx))
1101-
require.NoError(t, controller.ReconcileAll(ctx))
1110+
_, err = controller.ReconcileAll(ctx)
1111+
require.NoError(t, err)
1112+
_, err = controller.ReconcileAll(ctx)
1113+
require.NoError(t, err)
11021114

11031115
// Verify the final state after reconciliation.
11041116
// When hard limit is reached, no new workspace should be created.
@@ -1141,7 +1153,8 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
11411153
}
11421154

11431155
// Trigger reconciliation to make sure that successful, but outdated prebuilt workspace will be deleted.
1144-
require.NoError(t, controller.ReconcileAll(ctx))
1156+
_, err = controller.ReconcileAll(ctx)
1157+
require.NoError(t, err)
11451158

11461159
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
11471160
require.NoError(t, err)
@@ -1740,7 +1753,8 @@ func TestExpiredPrebuildsMultipleActions(t *testing.T) {
17401753
}
17411754

17421755
// Trigger reconciliation to process expired prebuilds and enforce desired state.
1743-
require.NoError(t, controller.ReconcileAll(ctx))
1756+
_, err = controller.ReconcileAll(ctx)
1757+
require.NoError(t, err)
17441758

17451759
// Sort non-expired workspaces by CreatedAt in ascending order (oldest first)
17461760
sort.Slice(nonExpiredWorkspaces, func(i, j int) bool {
@@ -2145,7 +2159,8 @@ func TestCancelPendingPrebuilds(t *testing.T) {
21452159
require.NoError(t, err)
21462160

21472161
// When: the reconciliation loop is triggered
2148-
require.NoError(t, reconciler.ReconcileAll(ctx))
2162+
_, err = reconciler.ReconcileAll(ctx)
2163+
require.NoError(t, err)
21492164

21502165
if tt.shouldCancel {
21512166
// Then: the pending prebuild job from non-active version should be canceled
@@ -2347,7 +2362,8 @@ func TestCancelPendingPrebuilds(t *testing.T) {
23472362
templateBVersion3Pending := setupPrebuilds(t, db, owner.OrganizationID, templateBID, templateBVersion3ID, templateBVersion3PresetID, 1, true)
23482363

23492364
// When: the reconciliation loop is executed
2350-
require.NoError(t, reconciler.ReconcileAll(ctx))
2365+
_, err := reconciler.ReconcileAll(ctx)
2366+
require.NoError(t, err)
23512367

23522368
// Then: template A version 1 running workspaces should not be canceled
23532369
checkIfJobCanceledAndDeleted(t, clock, ctx, db, false, templateAVersion1Running)
@@ -2369,6 +2385,51 @@ func TestCancelPendingPrebuilds(t *testing.T) {
23692385
})
23702386
}
23712387

2388+
func TestReconciliationStats(t *testing.T) {
2389+
t.Parallel()
2390+
2391+
// Setup
2392+
clock := quartz.NewReal()
2393+
db, ps := dbtestutil.NewDB(t)
2394+
client, _, _ := coderdtest.NewWithAPI(t, &coderdtest.Options{
2395+
Database: db,
2396+
Pubsub: ps,
2397+
Clock: clock,
2398+
})
2399+
fakeEnqueuer := newFakeEnqueuer()
2400+
registry := prometheus.NewRegistry()
2401+
cache := files.New(registry, &coderdtest.FakeAuthorizer{})
2402+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: false}).Leveled(slog.LevelDebug)
2403+
reconciler := prebuilds.NewStoreReconciler(db, ps, cache, codersdk.PrebuildsConfig{}, logger, clock, registry, fakeEnqueuer, newNoopUsageCheckerPtr())
2404+
owner := coderdtest.CreateFirstUser(t, client)
2405+
2406+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
2407+
defer cancel()
2408+
2409+
// Create a template version with a preset
2410+
dbfake.TemplateVersion(t, db).Seed(database.TemplateVersion{
2411+
OrganizationID: owner.OrganizationID,
2412+
CreatedBy: owner.UserID,
2413+
}).Preset(database.TemplateVersionPreset{
2414+
DesiredInstances: sql.NullInt32{
2415+
Int32: 1,
2416+
Valid: true,
2417+
},
2418+
}).Do()
2419+
2420+
// Verify that ReconcileAll tracks and returns elapsed time
2421+
start := time.Now()
2422+
stats, err := reconciler.ReconcileAll(ctx)
2423+
actualElapsed := time.Since(start)
2424+
require.NoError(t, err)
2425+
require.Greater(t, stats.Elapsed, time.Duration(0))
2426+
2427+
// Verify stats.Elapsed matches actual execution time
2428+
require.InDelta(t, actualElapsed.Milliseconds(), stats.Elapsed.Milliseconds(), 100)
2429+
// Verify reconciliation loop is not unexpectedly slow
2430+
require.Less(t, stats.Elapsed, 5*time.Second)
2431+
}
2432+
23722433
func newNoopEnqueuer() *notifications.NoopEnqueuer {
23732434
return notifications.NewNoopEnqueuer()
23742435
}
@@ -2863,7 +2924,7 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) {
28632924
_ = setupTestDBPreset(t, db, templateVersionID, 2, "test")
28642925

28652926
// Initially, reconciliation should create prebuilds
2866-
err := reconciler.ReconcileAll(ctx)
2927+
_, err := reconciler.ReconcileAll(ctx)
28672928
require.NoError(t, err)
28682929

28692930
// Verify that prebuilds were created
@@ -2890,7 +2951,7 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) {
28902951
require.Len(t, workspaces, 0, "prebuilds should be deleted")
28912952

28922953
// Run reconciliation again - it should be paused and not recreate prebuilds
2893-
err = reconciler.ReconcileAll(ctx)
2954+
_, err = reconciler.ReconcileAll(ctx)
28942955
require.NoError(t, err)
28952956

28962957
// Verify that no new prebuilds were created because reconciliation is paused
@@ -2903,7 +2964,7 @@ func TestReconciliationRespectsPauseSetting(t *testing.T) {
29032964
require.NoError(t, err)
29042965

29052966
// Run reconciliation again - it should now recreate the prebuilds
2906-
err = reconciler.ReconcileAll(ctx)
2967+
_, err = reconciler.ReconcileAll(ctx)
29072968
require.NoError(t, err)
29082969

29092970
// Verify that prebuilds were recreated

0 commit comments

Comments
 (0)