Skip to content

Commit c3e3bb5

Browse files
authored
feat: delete pending canceled prebuilds (coder#20499)
## Description PR coder#20387 introduced canceling pending prebuild jobs from inactive template versions to avoid provisioning obsolete workspaces. However, the associated prebuilds remained in the database with "Canceled" status, visible in the UI. This PR now orphan-deletes these canceled prebuilt workspaces. Since the canceled jobs were never processed by a provisioner, no Terraform resources were created, making orphan deletion safe. Orphan deletion always creates a provisioner job, but behaves differently based on provisioner availability: - If no provisioner daemon is available, the job is immediately marked as completed and the workspace is marked as deleted without any provisioner processing - If a provisioner daemon is available, it processes the delete job with empty Terraform state (no actual resources to destroy) The job cancellation and workspace deletion occur atomically in the same transaction. We don't split this into two separate reconciliation runs because there's no way to distinguish between system-canceled prebuilds and user-canceled workspaces. If we deleted canceled workspaces in a later run, we'd delete user-canceled workspaces that users may want to keep for troubleshooting. Note: This only applies to system-generated prebuilds from inactive template versions. ## Changes * Update `UpdatePrebuildProvisionerJobWithCancel` query to return job ID, workspace ID, template ID, and template version preset ID * Add `DeprovisionMode` enum to support orphan deletion in the provision flow * Update `ActionTypeCancelPending` handler to cancel jobs and orphan-delete associated workspaces atomically
1 parent 0d765f5 commit c3e3bb5

File tree

9 files changed

+219
-95
lines changed

9 files changed

+219
-95
lines changed

coderd/database/dbauthz/dbauthz.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4933,10 +4933,10 @@ func (q *querier) UpdateOrganizationDeletedByID(ctx context.Context, arg databas
49334933
return deleteQ(q.log, q.auth, q.db.GetOrganizationByID, deleteF)(ctx, arg.ID)
49344934
}
49354935

4936-
func (q *querier) UpdatePrebuildProvisionerJobWithCancel(ctx context.Context, arg database.UpdatePrebuildProvisionerJobWithCancelParams) ([]uuid.UUID, error) {
4936+
func (q *querier) UpdatePrebuildProvisionerJobWithCancel(ctx context.Context, arg database.UpdatePrebuildProvisionerJobWithCancelParams) ([]database.UpdatePrebuildProvisionerJobWithCancelRow, error) {
49374937
// Prebuild operation for canceling pending prebuild jobs from non-active template versions
49384938
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourcePrebuiltWorkspace); err != nil {
4939-
return []uuid.UUID{}, err
4939+
return []database.UpdatePrebuildProvisionerJobWithCancelRow{}, err
49404940
}
49414941
return q.db.UpdatePrebuildProvisionerJobWithCancel(ctx, arg)
49424942
}

coderd/database/dbauthz/dbauthz_test.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -646,10 +646,13 @@ func (s *MethodTestSuite) TestProvisionerJob() {
646646
PresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true},
647647
Now: dbtime.Now(),
648648
}
649-
jobIDs := []uuid.UUID{uuid.New(), uuid.New()}
649+
canceledJobs := []database.UpdatePrebuildProvisionerJobWithCancelRow{
650+
{ID: uuid.New(), WorkspaceID: uuid.New(), TemplateID: uuid.New(), TemplateVersionPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}},
651+
{ID: uuid.New(), WorkspaceID: uuid.New(), TemplateID: uuid.New(), TemplateVersionPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}},
652+
}
650653

651-
dbm.EXPECT().UpdatePrebuildProvisionerJobWithCancel(gomock.Any(), arg).Return(jobIDs, nil).AnyTimes()
652-
check.Args(arg).Asserts(rbac.ResourcePrebuiltWorkspace, policy.ActionUpdate).Returns(jobIDs)
654+
dbm.EXPECT().UpdatePrebuildProvisionerJobWithCancel(gomock.Any(), arg).Return(canceledJobs, nil).AnyTimes()
655+
check.Args(arg).Asserts(rbac.ResourcePrebuiltWorkspace, policy.ActionUpdate).Returns(canceledJobs)
653656
}))
654657
s.Run("GetProvisionerJobsByIDs", s.Mocked(func(dbm *dbmock.MockStore, faker *gofakeit.Faker, check *expects) {
655658
org := testutil.Fake(s.T(), faker, database.Organization{})

coderd/database/dbmetrics/querymetrics.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbmock/dbmock.go

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/querier.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 26 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/prebuilds.sql

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -300,12 +300,8 @@ GROUP BY wpb.template_version_preset_id;
300300
-- Cancels all pending provisioner jobs for prebuilt workspaces on a specific preset from an
301301
-- inactive template version.
302302
-- This is an optimization to clean up stale pending jobs.
303-
UPDATE provisioner_jobs
304-
SET
305-
canceled_at = @now::timestamptz,
306-
completed_at = @now::timestamptz
307-
WHERE id IN (
308-
SELECT pj.id
303+
WITH jobs_to_cancel AS (
304+
SELECT pj.id, w.id AS workspace_id, w.template_id, wpb.template_version_preset_id
309305
FROM provisioner_jobs pj
310306
INNER JOIN workspace_prebuild_builds wpb ON wpb.job_id = pj.id
311307
INNER JOIN workspaces w ON w.id = wpb.workspace_id
@@ -324,4 +320,10 @@ WHERE id IN (
324320
AND pj.canceled_at IS NULL
325321
AND pj.completed_at IS NULL
326322
)
327-
RETURNING id;
323+
UPDATE provisioner_jobs
324+
SET
325+
canceled_at = @now::timestamptz,
326+
completed_at = @now::timestamptz
327+
FROM jobs_to_cancel
328+
WHERE provisioner_jobs.id = jobs_to_cancel.id
329+
RETURNING jobs_to_cancel.id, jobs_to_cancel.workspace_id, jobs_to_cancel.template_id, jobs_to_cancel.template_version_preset_id;

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 104 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,24 @@ type StoreReconciler struct {
5757

5858
var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
5959

60+
type DeprovisionMode int
61+
62+
const (
63+
DeprovisionModeNormal DeprovisionMode = iota
64+
DeprovisionModeOrphan
65+
)
66+
67+
func (d DeprovisionMode) String() string {
68+
switch d {
69+
case DeprovisionModeOrphan:
70+
return "orphan"
71+
case DeprovisionModeNormal:
72+
return "normal"
73+
default:
74+
return "unknown"
75+
}
76+
}
77+
6078
func NewStoreReconciler(store database.Store,
6179
ps pubsub.Pubsub,
6280
fileCache *files.Cache,
@@ -642,34 +660,7 @@ func (c *StoreReconciler) executeReconciliationAction(ctx context.Context, logge
642660
return multiErr.ErrorOrNil()
643661

644662
case prebuilds.ActionTypeCancelPending:
645-
// Cancel pending prebuild jobs from non-active template versions to avoid
646-
// provisioning obsolete workspaces that would immediately be deprovisioned.
647-
// This uses a criteria-based update to ensure only jobs that are still pending
648-
// at execution time are canceled, avoiding race conditions where jobs may have
649-
// transitioned to running status between query and update.
650-
canceledJobs, err := c.store.UpdatePrebuildProvisionerJobWithCancel(
651-
ctx,
652-
database.UpdatePrebuildProvisionerJobWithCancelParams{
653-
Now: c.clock.Now(),
654-
PresetID: uuid.NullUUID{
655-
UUID: ps.Preset.ID,
656-
Valid: true,
657-
},
658-
})
659-
if err != nil {
660-
logger.Error(ctx, "failed to cancel pending prebuild jobs",
661-
slog.F("template_version_id", ps.Preset.TemplateVersionID.String()),
662-
slog.F("preset_id", ps.Preset.ID),
663-
slog.Error(err))
664-
return err
665-
}
666-
if len(canceledJobs) > 0 {
667-
logger.Info(ctx, "canceled pending prebuild jobs for inactive version",
668-
slog.F("template_version_id", ps.Preset.TemplateVersionID.String()),
669-
slog.F("preset_id", ps.Preset.ID),
670-
slog.F("count", len(canceledJobs)))
671-
}
672-
return nil
663+
return c.cancelAndOrphanDeletePendingPrebuilds(ctx, ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID)
673664

674665
default:
675666
return xerrors.Errorf("unknown action type: %v", action.ActionType)
@@ -717,33 +708,100 @@ func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltW
717708
c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name),
718709
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
719710

720-
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace)
711+
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace, DeprovisionModeNormal)
721712
}, &database.TxOptions{
722713
Isolation: sql.LevelRepeatableRead,
723714
ReadOnly: false,
724715
})
725716
}
726717

727-
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
718+
// provisionDelete provisions a delete transition for a prebuilt workspace.
719+
//
720+
// If mode is DeprovisionModeOrphan, the builder will not send Terraform state to the provisioner.
721+
// This allows the workspace to be deleted even when no provisioners are available, and is safe
722+
// when no Terraform resources were actually created (e.g., for pending prebuilds that were canceled
723+
// before provisioning started).
724+
//
725+
// IMPORTANT: This function must be called within a database transaction. It does not create its own transaction.
726+
// The caller is responsible for managing the transaction boundary via db.InTx().
727+
func (c *StoreReconciler) provisionDelete(ctx context.Context, db database.Store, workspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID, mode DeprovisionMode) error {
728+
workspace, err := db.GetWorkspaceByID(ctx, workspaceID)
729+
if err != nil {
730+
return xerrors.Errorf("get workspace by ID: %w", err)
731+
}
732+
733+
template, err := db.GetTemplateByID(ctx, templateID)
734+
if err != nil {
735+
return xerrors.Errorf("failed to get template: %w", err)
736+
}
737+
738+
if workspace.OwnerID != database.PrebuildsSystemUserID {
739+
return xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
740+
}
741+
742+
c.logger.Info(ctx, "attempting to delete prebuild", slog.F("orphan", mode.String()),
743+
slog.F("name", workspace.Name), slog.F("workspace_id", workspaceID.String()), slog.F("preset_id", presetID.String()))
744+
745+
return c.provision(ctx, db, workspaceID, template, presetID,
746+
database.WorkspaceTransitionDelete, workspace, mode)
747+
}
748+
749+
// cancelAndOrphanDeletePendingPrebuilds cancels pending prebuild jobs from inactive template versions
750+
// and orphan-deletes their associated workspaces.
751+
//
752+
// The cancel operation uses a criteria-based update to ensure only jobs that are still pending at
753+
// execution time are canceled, avoiding race conditions where jobs may have transitioned to running.
754+
//
755+
// Since these jobs were never processed by a provisioner, no Terraform resources were created,
756+
// making it safe to orphan-delete the workspaces (skipping Terraform destroy).
757+
func (c *StoreReconciler) cancelAndOrphanDeletePendingPrebuilds(ctx context.Context, templateID uuid.UUID, templateVersionID uuid.UUID, presetID uuid.UUID) error {
728758
return c.store.InTx(func(db database.Store) error {
729-
workspace, err := db.GetWorkspaceByID(ctx, prebuiltWorkspaceID)
759+
canceledJobs, err := db.UpdatePrebuildProvisionerJobWithCancel(
760+
ctx,
761+
database.UpdatePrebuildProvisionerJobWithCancelParams{
762+
Now: c.clock.Now(),
763+
PresetID: uuid.NullUUID{
764+
UUID: presetID,
765+
Valid: true,
766+
},
767+
})
730768
if err != nil {
731-
return xerrors.Errorf("get workspace by ID: %w", err)
769+
c.logger.Error(ctx, "failed to cancel pending prebuild jobs",
770+
slog.F("template_id", templateID.String()),
771+
slog.F("template_version_id", templateVersionID.String()),
772+
slog.F("preset_id", presetID.String()),
773+
slog.Error(err))
774+
return err
732775
}
733776

734-
template, err := db.GetTemplateByID(ctx, templateID)
735-
if err != nil {
736-
return xerrors.Errorf("failed to get template: %w", err)
777+
if len(canceledJobs) > 0 {
778+
c.logger.Info(ctx, "canceled pending prebuild jobs for inactive version",
779+
slog.F("template_id", templateID.String()),
780+
slog.F("template_version_id", templateVersionID.String()),
781+
slog.F("preset_id", presetID.String()),
782+
slog.F("count", len(canceledJobs)))
737783
}
738784

739-
if workspace.OwnerID != database.PrebuildsSystemUserID {
740-
return xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed")
785+
var multiErr multierror.Error
786+
for _, job := range canceledJobs {
787+
err = c.provisionDelete(ctx, db, job.WorkspaceID, job.TemplateID, presetID, DeprovisionModeOrphan)
788+
if err != nil {
789+
c.logger.Error(ctx, "failed to orphan delete canceled prebuild",
790+
slog.F("workspace_id", job.WorkspaceID.String()), slog.Error(err))
791+
multiErr.Errors = append(multiErr.Errors, err)
792+
}
741793
}
742794

743-
c.logger.Info(ctx, "attempting to delete prebuild",
744-
slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String()))
795+
return multiErr.ErrorOrNil()
796+
}, &database.TxOptions{
797+
Isolation: sql.LevelRepeatableRead,
798+
ReadOnly: false,
799+
})
800+
}
745801

746-
return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionDelete, workspace)
802+
func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error {
803+
return c.store.InTx(func(db database.Store) error {
804+
return c.provisionDelete(ctx, db, prebuiltWorkspaceID, templateID, presetID, DeprovisionModeNormal)
747805
}, &database.TxOptions{
748806
Isolation: sql.LevelRepeatableRead,
749807
ReadOnly: false,
@@ -758,6 +816,7 @@ func (c *StoreReconciler) provision(
758816
presetID uuid.UUID,
759817
transition database.WorkspaceTransition,
760818
workspace database.Workspace,
819+
mode DeprovisionMode,
761820
) error {
762821
tvp, err := db.GetPresetParametersByTemplateVersionID(ctx, template.ActiveVersionID)
763822
if err != nil {
@@ -795,6 +854,11 @@ func (c *StoreReconciler) provision(
795854
builder = builder.RichParameterValues(params)
796855
}
797856

857+
// Use orphan mode for deletes when no Terraform resources exist
858+
if transition == database.WorkspaceTransitionDelete && mode == DeprovisionModeOrphan {
859+
builder = builder.Orphan()
860+
}
861+
798862
_, provisionerJob, _, err := builder.Build(
799863
ctx,
800864
db,

0 commit comments

Comments
 (0)