Skip to content

Commit a7b3efb

Browse files
authored
feat: delete pending canceled prebuilds (coder#20499) (coder#20554)
Related to PR: coder#20499 (cherry picked from commit c3e3bb5)
1 parent 0b5542f commit a7b3efb

File tree

9 files changed

+219
-95
lines changed

9 files changed

+219
-95
lines changed

coderd/database/dbauthz/dbauthz.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4934,10 +4934,10 @@ func (q *querier) UpdateOrganizationDeletedByID(ctx context.Context, arg databas
49344934
return deleteQ(q.log, q.auth, q.db.GetOrganizationByID, deleteF)(ctx, arg.ID)
49354935
}
49364936

4937-
func (q *querier) UpdatePrebuildProvisionerJobWithCancel(ctx context.Context, arg database.UpdatePrebuildProvisionerJobWithCancelParams) ([]uuid.UUID, error) {
4937+
func (q *querier) UpdatePrebuildProvisionerJobWithCancel(ctx context.Context, arg database.UpdatePrebuildProvisionerJobWithCancelParams) ([]database.UpdatePrebuildProvisionerJobWithCancelRow, error) {
49384938
// Prebuild operation for canceling pending prebuild jobs from non-active template versions
49394939
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourcePrebuiltWorkspace); err != nil {
4940-
return []uuid.UUID{}, err
4940+
return []database.UpdatePrebuildProvisionerJobWithCancelRow{}, err
49414941
}
49424942
return q.db.UpdatePrebuildProvisionerJobWithCancel(ctx, arg)
49434943
}

coderd/database/dbauthz/dbauthz_test.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -646,10 +646,13 @@ func (s *MethodTestSuite) TestProvisionerJob() {
646646
PresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true},
647647
Now: dbtime.Now(),
648648
}
649-
jobIDs := []uuid.UUID{uuid.New(), uuid.New()}
649+
canceledJobs := []database.UpdatePrebuildProvisionerJobWithCancelRow{
650+
{ID: uuid.New(), WorkspaceID: uuid.New(), TemplateID: uuid.New(), TemplateVersionPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}},
651+
{ID: uuid.New(), WorkspaceID: uuid.New(), TemplateID: uuid.New(), TemplateVersionPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}},
652+
}
650653

651-
dbm.EXPECT().UpdatePrebuildProvisionerJobWithCancel(gomock.Any(), arg).Return(jobIDs, nil).AnyTimes()
652-
check.Args(arg).Asserts(rbac.ResourcePrebuiltWorkspace, policy.ActionUpdate).Returns(jobIDs)
654+
dbm.EXPECT().UpdatePrebuildProvisionerJobWithCancel(gomock.Any(), arg).Return(canceledJobs, nil).AnyTimes()
655+
check.Args(arg).Asserts(rbac.ResourcePrebuiltWorkspace, policy.ActionUpdate).Returns(canceledJobs)
653656
}))
654657
s.Run("GetProvisionerJobsByIDs", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
655658
org := testutil.Fake(s.T(), faker, database.Organization{})

coderd/database/dbmetrics/querymetrics.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbmock/dbmock.go

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/querier.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 26 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/prebuilds.sql

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -300,12 +300,8 @@ GROUP BY wpb.template_version_preset_id;
300300
-- Cancels all pending provisioner jobs for prebuilt workspaces on a specific preset from an
301301
-- inactive template version.
302302
-- This is an optimization to clean up stale pending jobs.
303-
UPDATE provisioner_jobs
304-
SET
305-
canceled_at = @now::timestamptz,
306-
completed_at = @now::timestamptz
307-
WHERE id IN (
308-
SELECT pj.id
303+
WITH jobs_to_cancel AS (
304+
SELECT pj.id, w.id AS workspace_id, w.template_id, wpb.template_version_preset_id
309305
FROM provisioner_jobs pj
310306
INNER JOIN workspace_prebuild_builds wpb ON wpb.job_id = pj.id
311307
INNER JOIN workspaces w ON w.id = wpb.workspace_id
@@ -324,4 +320,10 @@ WHERE id IN (
324320
AND pj.canceled_at IS NULL
325321
AND pj.completed_at IS NULL
326322
)
327-
RETURNING id;
323+
UPDATE provisioner_jobs
324+
SET
325+
canceled_at = @now::timestamptz,
326+
completed_at = @now::timestamptz
327+
FROM jobs_to_cancel
328+
WHERE provisioner_jobs.id = jobs_to_cancel.id
329+
RETURNING jobs_to_cancel.id, jobs_to_cancel.workspace_id, jobs_to_cancel.template_id, jobs_to_cancel.template_version_preset_id;

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 104 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,24 @@ type StoreReconciler struct {
5757

5858
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
5959

60+
type DeprovisionMode int
61+
62+
const (
63+
DeprovisionModeNormal DeprovisionMode = iota
64+
DeprovisionModeOrphan
65+
)
66+
67+
func (d DeprovisionMode) String() string {
68+
switch d {
69+
case DeprovisionModeOrphan:
70+
return "orphan"
71+
case DeprovisionModeNormal:
72+
return "normal"
73+
default:
74+
return "unknown"
75+
}
76+
}
77+
6078
func NewStoreReconciler(store database.Store,
6179
ps pubsub.Pubsub,
6280
fileCache *files.Cache,
@@ -642,34 +660,7 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
642660
return multiErr.ErrorOrNil()
643661

644662
case prebuilds.ActionTypeCancelPending:
645-
// Cancel pending prebuild jobs from non-active template versions to avoid
646-
// provisioning obsolete workspaces that would immediately be deprovisioned.
647-
// This uses a criteria-based update to ensure only jobs that are still pending
648-
// at execution time are canceled, avoiding race conditions where jobs may have
649-
// transitioned to running status between query and update.
650-
canceledJobs, err := c.store.UpdatePrebuildProvisionerJobWithCancel(
651-
ctx,
652-
database.UpdatePrebuildProvisionerJobWithCancelParams{
653-
Now: c.clock.Now(),
654-
PresetID: uuid.NullUUID{
655-
UUID: ps.Preset.ID,
656-
Valid: true,
657-
},
658-
})
659-
if err != nil {
660-
logger.Error(ctx, "failed to cancel pending prebuild jobs",
661-
slog.F("template_version_id", ps.Preset.TemplateVersionID.String()),
662-
slog.F("preset_id", ps.Preset.ID),
663-
slog.Error(err))
664-
return err
665-
}
666-
if len(canceledJobs) > 0 {
667-
logger.Info(ctx, "canceled pending prebuild jobs for inactive version",
668-
slog.F("template_version_id", ps.Preset.TemplateVersionID.String()),
669-
slog.F("preset_id", ps.Preset.ID),
670-
slog.F("count", len(canceledJobs)))
671-
}
672-
return nil
663+
return c.cancelAndOrphanDeletePendingPrebuilds(ctx, ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID)
673664

674665
default:
675666
return xerrors.Errorf("unknown action type: %v", action.ActionType)
@@ -717,33 +708,100 @@ func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltW
717708
c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name),
718709
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
719710

720-
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace)
711+
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace, DeprovisionModeNormal)
721712
}, &database.TxOptions{
722713
Isolation: sql.LevelRepeatableRead,
723714
ReadOnly: false,
724715
})
725716
}
726717

727-
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
718+
// provisionDelete provisions a delete transition for a prebuilt workspace.
719+
//
720+
// If mode is DeprovisionModeOrphan, the builder will not send Terraform state to the provisioner.
721+
// This allows the workspace to be deleted even when no provisioners are available, and is safe
722+
// when no Terraform resources were actually created (e.g., for pending prebuilds that were canceled
723+
// before provisioning started).
724+
//
725+
// IMPORTANT: This function must be called within a database transaction. It does not create its own transaction.
726+
// The caller is responsible for managing the transaction boundary via db.InTx().
727+
func (c *StoreReconciler) provisionDelete(ctx context.Context, db database.Store, workspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID, mode DeprovisionMode) error {
728+
workspace, err := db.GetWorkspaceByID(ctx, workspaceID)
729+
if err != nil {
730+
return xerrors.Errorf("get workspace by ID: %w", err)
731+
}
732+
733+
template, err := db.GetTemplateByID(ctx, templateID)
734+
if err != nil {
735+
return xerrors.Errorf("failed to get template: %w", err)
736+
}
737+
738+
if workspace.OwnerID != database.PrebuildsSystemUserID {
739+
return xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
740+
}
741+
742+
c.logger.Info(ctx, "attempting to delete prebuild", slog.F("orphan", mode.String()),
743+
slog.F("name", workspace.Name), slog.F("workspace_id", workspaceID.String()), slog.F("preset_id", presetID.String()))
744+
745+
return c.provision(ctx, db, workspaceID, template, presetID,
746+
database.WorkspaceTransitionDelete, workspace, mode)
747+
}
748+
749+
// cancelAndOrphanDeletePendingPrebuilds cancels pending prebuild jobs from inactive template versions
750+
// and orphan-deletes their associated workspaces.
751+
//
752+
// The cancel operation uses a criteria-based update to ensure only jobs that are still pending at
753+
// execution time are canceled, avoiding race conditions where jobs may have transitioned to running.
754+
//
755+
// Since these jobs were never processed by a provisioner, no Terraform resources were created,
756+
// making it safe to orphan-delete the workspaces (skipping Terraform destroy).
757+
func (c *StoreReconciler) cancelAndOrphanDeletePendingPrebuilds(ctx context.Context, templateID uuid.UUID, templateVersionID uuid.UUID, presetID uuid.UUID) error {
728758
return c.store.InTx(func(db database.Store) error {
729-
workspace, err := db.GetWorkspaceByID(ctx, prebuiltWorkspaceID)
759+
canceledJobs, err := db.UpdatePrebuildProvisionerJobWithCancel(
760+
ctx,
761+
database.UpdatePrebuildProvisionerJobWithCancelParams{
762+
Now: c.clock.Now(),
763+
PresetID: uuid.NullUUID{
764+
UUID: presetID,
765+
Valid: true,
766+
},
767+
})
730768
if err != nil {
731-
return xerrors.Errorf("get workspace by ID: %w", err)
769+
c.logger.Error(ctx, "failed to cancel pending prebuild jobs",
770+
slog.F("template_id", templateID.String()),
771+
slog.F("template_version_id", templateVersionID.String()),
772+
slog.F("preset_id", presetID.String()),
773+
slog.Error(err))
774+
return err
732775
}
733776

734-
template, err := db.GetTemplateByID(ctx, templateID)
735-
if err != nil {
736-
return xerrors.Errorf("failed to get template: %w", err)
777+
if len(canceledJobs) > 0 {
778+
c.logger.Info(ctx, "canceled pending prebuild jobs for inactive version",
779+
slog.F("template_id", templateID.String()),
780+
slog.F("template_version_id", templateVersionID.String()),
781+
slog.F("preset_id", presetID.String()),
782+
slog.F("count", len(canceledJobs)))
737783
}
738784

739-
if workspace.OwnerID != database.PrebuildsSystemUserID {
740-
return xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
785+
var multiErr multierror.Error
786+
for _, job := range canceledJobs {
787+
err = c.provisionDelete(ctx, db, job.WorkspaceID, job.TemplateID, presetID, DeprovisionModeOrphan)
788+
if err != nil {
789+
c.logger.Error(ctx, "failed to orphan delete canceled prebuild",
790+
slog.F("workspace_id", job.WorkspaceID.String()), slog.Error(err))
791+
multiErr.Errors = append(multiErr.Errors, err)
792+
}
741793
}
742794

743-
c.logger.Info(ctx, "attempting to delete prebuild",
744-
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
795+
return multiErr.ErrorOrNil()
796+
}, &database.TxOptions{
797+
Isolation: sql.LevelRepeatableRead,
798+
ReadOnly: false,
799+
})
800+
}
745801

746-
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionDelete, workspace)
802+
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
803+
return c.store.InTx(func(db database.Store) error {
804+
return c.provisionDelete(ctx, db, prebuiltWorkspaceID, templateID, presetID, DeprovisionModeNormal)
747805
}, &database.TxOptions{
748806
Isolation: sql.LevelRepeatableRead,
749807
ReadOnly: false,
@@ -758,6 +816,7 @@ func (c *StoreReconciler) provision(
758816
presetID uuid.UUID,
759817
transition database.WorkspaceTransition,
760818
workspace database.Workspace,
819+
mode DeprovisionMode,
761820
) error {
762821
tvp, err := db.GetPresetParametersByTemplateVersionID(ctx, template.ActiveVersionID)
763822
if err != nil {
@@ -795,6 +854,11 @@ func (c *StoreReconciler) provision(
795854
builder = builder.RichParameterValues(params)
796855
}
797856

857+
// Use orphan mode for deletes when no Terraform resources exist
858+
if transition == database.WorkspaceTransitionDelete && mode == DeprovisionModeOrphan {
859+
builder = builder.Orphan()
860+
}
861+
798862
_, provisionerJob, _, err := builder.Build(
799863
ctx,
800864
db,

0 commit comments

Comments
 (0)