Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: concurrency issue replay #174

Merged
merged 4 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 31 additions & 46 deletions internal/store/postgres/scheduler/replay_repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ const (
replayColumns = `id, ` + replayColumnsToStore + `, created_at`

replayRunColumns = `replay_id, scheduled_at, status`
replayRunDetailColumns = `id as replay_id, job_name, namespace_name, project_name, start_time, end_time, description,
parallel, job_config, r.status as replay_status, r.message as replay_message, scheduled_at, run.status as run_status, r.created_at as replay_created_at`
replayRunDetailColumns = `r.id as replay_id, r.job_name, r.namespace_name, r.project_name, r.start_time, r.end_time, r.description,
r.parallel, r.job_config, r.status as replay_status, r.message as replay_message, run.scheduled_at, run.status as run_status, r.created_at as replay_created_at`

updateReplayRequest = `UPDATE replay_request SET status = $1, message = $2, updated_at = NOW() WHERE id = $3`
)
Expand Down Expand Up @@ -146,34 +146,14 @@ func (r ReplayRepository) RegisterReplay(ctx context.Context, replay *scheduler.
}

func (r ReplayRepository) GetReplayToExecute(ctx context.Context) (*scheduler.ReplayWithRun, error) {
tx, err := r.db.BeginTx(ctx, pgx.TxOptions{})
replayRuns, err := r.getExecutableReplayRuns(ctx)
if err != nil {
return nil, err
}

replayRuns, err := r.getExecutableReplayRuns(ctx, tx)
if err != nil {
tx.Rollback(ctx)
return nil, err
}
if replayRuns == nil {
tx.Rollback(ctx)
return nil, errors.NotFound(scheduler.EntityJobRun, "no executable replay request found")
}

storedReplay, err := toReplay(replayRuns)
if err != nil {
tx.Rollback(ctx)
return nil, err
}

// TODO: Avoid having In Progress, but instead use row lock (for update)
if _, err := tx.Exec(ctx, updateReplayRequest, scheduler.ReplayStateInProgress, "", storedReplay.Replay.ID()); err != nil {
tx.Rollback(ctx)
return nil, errors.Wrap(scheduler.EntityJobRun, "unable to update replay", err)
}
tx.Commit(ctx)
return storedReplay, nil
return toReplay(replayRuns)
}

func (r ReplayRepository) GetReplayRequestsByStatus(ctx context.Context, statusList []scheduler.ReplayState) ([]*scheduler.Replay, error) {
Expand Down Expand Up @@ -304,11 +284,10 @@ func (r ReplayRepository) UpdateReplayStatus(ctx context.Context, id uuid.UUID,
}

func (r ReplayRepository) UpdateReplay(ctx context.Context, id uuid.UUID, replayStatus scheduler.ReplayState, runs []*scheduler.JobRunStatus, message string) error {
if err := r.updateReplayRequest(ctx, id, replayStatus, message); err != nil {
if err := r.updateReplayRuns(ctx, id, runs); err != nil {
return err
}

return r.updateReplayRuns(ctx, id, runs)
return r.updateReplayRequest(ctx, id, replayStatus, message)
}

func (r ReplayRepository) GetReplayJobConfig(ctx context.Context, jobTenant tenant.Tenant, jobName scheduler.JobName, scheduledAt time.Time) (map[string]string, error) {
Expand Down Expand Up @@ -347,15 +326,13 @@ func (r ReplayRepository) updateReplayRuns(ctx context.Context, id uuid.UUID, ru
if err != nil {
return err
}

deleteRuns := `DELETE FROM replay_run WHERE replay_id = $1`
if _, err := tx.Exec(ctx, deleteRuns, id); err != nil {
tx.Rollback(ctx)
return errors.Wrap(scheduler.EntityJobRun, "unable to delete runs of replay", err)
}
if err := r.insertReplayRuns(ctx, tx, id, runs); err != nil {
tx.Rollback(ctx)
return errors.Wrap(scheduler.EntityJobRun, "unable to insert runs of replay", err)
query := `UPDATE replay_run SET status=$1, updated_at=NOW() WHERE replay_id=$2 AND scheduled_at=$3 AND status<>$1`
for _, run := range runs {
_, err := tx.Exec(ctx, query, run.State, id, run.ScheduledAt)
if err != nil {
tx.Rollback(ctx)
return errors.Wrap(scheduler.EntityJobRun, "unable to update replay runs", err)
}
}
tx.Commit(ctx)
return nil
Expand Down Expand Up @@ -411,18 +388,26 @@ func (r ReplayRepository) getReplayRuns(ctx context.Context, replayID uuid.UUID)
return runs, nil
}

func (ReplayRepository) getExecutableReplayRuns(ctx context.Context, tx pgx.Tx) ([]*replayRun, error) {
getReplayRequest := `
WITH request AS (
SELECT ` + replayColumns + ` FROM replay_request WHERE status IN ('created', 'partial replayed', 'replayed')
ORDER BY updated_at DESC LIMIT 1
func (r ReplayRepository) getExecutableReplayRuns(ctx context.Context) ([]*replayRun, error) {
query := `
UPDATE replay_request
SET status = $1, message = $2, updated_at = NOW()
FROM
replay_run AS run
JOIN replay_request AS r
ON (replay_id = r.id)
WHERE r.id = (
SELECT id FROM replay_request
WHERE status IN ('created', 'partial replayed', 'replayed')
ORDER BY updated_at DESC
FOR UPDATE SKIP LOCKED
LIMIT 1
)
SELECT ` + replayRunDetailColumns + ` FROM replay_run AS run
JOIN request AS r ON (replay_id = r.id)`
RETURNING ` + replayRunDetailColumns + `;`

rows, err := tx.Query(ctx, getReplayRequest)
rows, err := r.db.Query(ctx, query, scheduler.ReplayStateInProgress, "")
if err != nil {
return nil, errors.Wrap(job.EntityJob, "unable to get the stored replay", err)
return nil, errors.Wrap(job.EntityJob, "unable to get and update status of the stored replay", err)
}
defer rows.Close()

Expand All @@ -431,7 +416,7 @@ func (ReplayRepository) getExecutableReplayRuns(ctx context.Context, tx pgx.Tx)
var run replayRun
if err := rows.Scan(&run.ID, &run.JobName, &run.NamespaceName, &run.ProjectName, &run.StartTime, &run.EndTime,
&run.Description, &run.Parallel, &run.JobConfig, &run.ReplayStatus, &run.Message, &run.ScheduledTime, &run.RunStatus, &run.CreatedAt); err != nil {
return runs, errors.Wrap(scheduler.EntityJobRun, "unable to get the stored replay", err)
return runs, errors.Wrap(scheduler.EntityJobRun, "unable to scan the stored replay runs", err)
}
runs = append(runs, &run)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func TestPostgresSchedulerRepository(t *testing.T) {
})

t.Run("UpdateReplay", func(t *testing.T) {
t.Run("updates replay request and reinsert the runs", func(t *testing.T) {
t.Run("updates replay request and update the runs", func(t *testing.T) {
db := dbSetup()
replayRepo := postgres.NewReplayRepository(db)

Expand Down
Loading