From 171fc894547e0fdb0dc9e38a82cdd18754c8a845 Mon Sep 17 00:00:00 2001 From: Forrest Babcock Date: Thu, 14 May 2026 15:59:23 -0400 Subject: [PATCH 1/5] TRT-1989: model changes to prep for partitioning --- cmd/sippy/seed_data.go | 24 ++++++----- pkg/api/job_runs_test.go | 9 ++-- pkg/dataloader/prowloader/prow.go | 46 ++++++++++++--------- pkg/db/models/prow.go | 17 +++++++- pkg/sippyserver/pr_new_tests_worker_test.go | 10 +++-- 5 files changed, 67 insertions(+), 39 deletions(-) diff --git a/cmd/sippy/seed_data.go b/cmd/sippy/seed_data.go index d3fd762234..6550bf7729 100644 --- a/cmd/sippy/seed_data.go +++ b/cmd/sippy/seed_data.go @@ -574,10 +574,11 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK for i := range totalRuns { timestamp := start.Add(time.Duration(i) * interval) run := models.ProwJobRun{ - ProwJobID: prowJob.ID, - Cluster: "build01", - Timestamp: timestamp, - Duration: 3 * time.Hour, + ProwJobID: prowJob.ID, + ProwJobRelease: prowJob.Release, + Cluster: "build01", + Timestamp: timestamp, + Duration: 3 * time.Hour, } if err := dbc.DB.Create(&run).Error; err != nil { return 0, 0, fmt.Errorf("failed to create ProwJobRun: %w", err) @@ -616,12 +617,15 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK } result := models.ProwJobRunTest{ - ProwJobRunID: runIDs[i], - TestID: testID, - SuiteID: &suite.ID, - Status: status, - Duration: 5.0, - CreatedAt: start.Add(time.Duration(i) * interval), + ProwJobRunID: runIDs[i], + ProwJobID: prowJob.ID, + ProwJobRunRelease: prowJob.Release, + ProwJobRunTimestamp: start.Add(time.Duration(i) * interval), + TestID: testID, + SuiteID: &suite.ID, + Status: status, + Duration: 5.0, + CreatedAt: start.Add(time.Duration(i) * interval), } if err := dbc.DB.Create(&result).Error; err != nil { return 0, 0, fmt.Errorf("failed to create ProwJobRunTest: %w", err) diff --git a/pkg/api/job_runs_test.go b/pkg/api/job_runs_test.go index 9f3cf2edb6..58e7ba0092 100644 --- a/pkg/api/job_runs_test.go +++ b/pkg/api/job_runs_test.go @@ -220,9 +220,11 @@ func TestRunJobAnalysis(t *testing.T) { for _, t := range tests { fakeProwJobRun.Tests = append(fakeProwJobRun.Tests, models.ProwJobRunTest{ - Test: models.Test{Name: t.Name}, - Suite: models.Suite{Name: t.SuiteName}, - Status: 12, + ProwJobID: fakeProwJobRun.ProwJobID, + ProwJobRunRelease: fakeProwJobRun.ProwJobRelease, + Test: models.Test{Name: t.Name}, + Suite: models.Suite{Name: t.SuiteName}, + Status: 12, }) } @@ -289,6 +291,7 @@ func buildFakeProwJobRun() *models.ProwJobRun { }, }, ProwJobID: 1000000000, + ProwJobRelease: "4.12", URL: "https://example.com/run/1000000000", Tests: []models.ProwJobRunTest{}, // will be populated in the test cases TestCount: 5, diff --git a/pkg/dataloader/prowloader/prow.go b/pkg/dataloader/prowloader/prow.go index f01cbea16a..33f7f2ad20 100644 --- a/pkg/dataloader/prowloader/prow.go +++ b/pkg/dataloader/prowloader/prow.go @@ -930,7 +930,7 @@ func (pl *ProwLoader) createOrUpdateProwJob(ctx context.Context, pj *prow.ProwJo } func (pl *ProwLoader) processGCSBucketJobRun(ctx context.Context, pj *prow.ProwJob, id uint64, path string, junitMatches []string, dbProwJob *models.ProwJob) error { - tests, failures, overallResult, err := pl.prowJobRunTestsFromGCS(ctx, pj, uint(id), path, junitMatches) + tests, failures, overallResult, err := pl.prowJobRunTestsFromGCS(ctx, pj, uint(id), dbProwJob.ID, dbProwJob.Release, path, junitMatches) if err != nil { return err } @@ -959,19 +959,20 @@ func (pl *ProwLoader) processGCSBucketJobRun(ctx context.Context, pj *prow.ProwJ Model: gorm.Model{ ID: uint(id), }, - Cluster: pj.Spec.Cluster, - Duration: duration, - ProwJob: *dbProwJob, - ProwJobID: dbProwJob.ID, - URL: pj.Status.URL, - GCSBucket: pj.Spec.DecorationConfig.GCSConfiguration.Bucket, - Timestamp: pj.Status.StartTime, - OverallResult: overallResult, - PullRequests: pulls, - TestFailures: failures, - Succeeded: overallResult == sippyprocessingv1.JobSucceeded, - Labels: labels, - Annotations: annotations, + Cluster: pj.Spec.Cluster, + Duration: duration, + ProwJob: *dbProwJob, + ProwJobID: dbProwJob.ID, + ProwJobRelease: dbProwJob.Release, + URL: pj.Status.URL, + GCSBucket: pj.Spec.DecorationConfig.GCSConfiguration.Bucket, + Timestamp: pj.Status.StartTime, + OverallResult: overallResult, + PullRequests: pulls, + TestFailures: failures, + Succeeded: overallResult == sippyprocessingv1.JobSucceeded, + Labels: labels, + Annotations: annotations, }).Error if err != nil { return err @@ -1187,7 +1188,7 @@ func (pl *ProwLoader) findSuite(name string) *uint { return id } -func (pl *ProwLoader) prowJobRunTestsFromGCS(ctx context.Context, pj *prow.ProwJob, id uint, path string, junitPaths []string) ([]*models.ProwJobRunTest, int, sippyprocessingv1.JobOverallResult, error) { +func (pl *ProwLoader) prowJobRunTestsFromGCS(ctx context.Context, pj *prow.ProwJob, id, prowJobID uint, prowJobRelease, path string, junitPaths []string) ([]*models.ProwJobRunTest, int, sippyprocessingv1.JobOverallResult, error) { failures := 0 bkt := pl.gcsClient.Bucket(pj.Spec.DecorationConfig.GCSConfiguration.Bucket) @@ -1206,7 +1207,7 @@ func (pl *ProwLoader) prowJobRunTestsFromGCS(ctx context.Context, pj *prow.ProwJ continue } - pl.extractTestCases(suite, suiteID, testCases) + pl.extractTestCases(suite, suiteID, testCases, prowJobRelease, pj.Status.StartTime) } syntheticSuite, jobResult := testconversion.ConvertProwJobRunToSyntheticTests(*pj, testCases, pl.syntheticTestManager) @@ -1216,7 +1217,7 @@ func (pl *ProwLoader) prowJobRunTestsFromGCS(ctx context.Context, pj *prow.ProwJ // this shouldn't happen but if it does we want to know panic("synthetic suite is missing from the database") } - pl.extractTestCases(syntheticSuite, suiteID, testCases) + pl.extractTestCases(syntheticSuite, suiteID, testCases, prowJobRelease, pj.Status.StartTime) log.Infof("synthetic suite had %d tests", syntheticSuite.NumTests) results := make([]*models.ProwJobRunTest, 0) @@ -1226,6 +1227,9 @@ func (pl *ProwLoader) prowJobRunTestsFromGCS(ctx context.Context, pj *prow.ProwJ } testCases[k].ProwJobRunID = id + testCases[k].ProwJobID = prowJobID + testCases[k].ProwJobRunRelease = prowJobRelease + testCases[k].ProwJobRunTimestamp = pj.Status.StartTime results = append(results, testCases[k]) if testCases[k].Status == 12 { failures++ @@ -1235,7 +1239,7 @@ func (pl *ProwLoader) prowJobRunTestsFromGCS(ctx context.Context, pj *prow.ProwJ return results, failures, jobResult, nil } -func (pl *ProwLoader) extractTestCases(suite *junit.TestSuite, suiteID *uint, testCases map[string]*models.ProwJobRunTest) { +func (pl *ProwLoader) extractTestCases(suite *junit.TestSuite, suiteID *uint, testCases map[string]*models.ProwJobRunTest, prowJobRelease string, prowJobStartTime time.Time) { for _, tc := range suite.TestCases { if testidentification.IsIgnoredTest(tc.Name) { @@ -1250,7 +1254,9 @@ func (pl *ProwLoader) extractTestCases(suite *junit.TestSuite, suiteID *uint, te status = sippyprocessingv1.TestStatusSuccess default: failureOutput = &models.ProwJobRunTestOutput{ - Output: tc.FailureOutput.Output, + Output: tc.FailureOutput.Output, + ProwJobRunTestTimestamp: prowJobStartTime, + ProwJobRunTestRelease: prowJobRelease, } } @@ -1283,6 +1289,6 @@ func (pl *ProwLoader) extractTestCases(suite *junit.TestSuite, suiteID *uint, te } for _, c := range suite.Children { - pl.extractTestCases(c, suiteID, testCases) + pl.extractTestCases(c, suiteID, testCases, prowJobRelease, prowJobStartTime) } } diff --git a/pkg/db/models/prow.go b/pkg/db/models/prow.go index 18fd49e037..646ec96080 100644 --- a/pkg/db/models/prow.go +++ b/pkg/db/models/prow.go @@ -39,6 +39,8 @@ type ProwJobRun struct { // ProwJob is a link to the prow job this run belongs to. ProwJob ProwJob ProwJobID uint `gorm:"index"` + // Used for partitioning + ProwJobRelease string `gorm:"varchar(10)"` // Cluster is the cluster where the prow job was run. Cluster string @@ -87,8 +89,15 @@ type ProwJobRunTest struct { gorm.Model ProwJobRunID uint `gorm:"index"` ProwJobRun ProwJobRun - TestID uint `gorm:"index;index:idx_prow_job_run_tests_test_id_status"` - Test Test + // used for variants + // skips joining on ProwJobRunID just to get ProwJobID + ProwJobID uint + // used for partitioning + ProwJobRunTimestamp time.Time `gorm:"expression:DATE(timestamp AT TIME ZONE 'UTC')"` + // used for partitioning + ProwJobRunRelease string `gorm:"varchar(10)"` + TestID uint `gorm:"index;index:idx_prow_job_run_tests_test_id_status"` + Test Test // SuiteID may be nil if no suite name could be parsed from the testgrid test name. SuiteID *uint `gorm:"index"` Suite Suite @@ -107,6 +116,10 @@ type ProwJobRunTestOutput struct { ProwJobRunTestID uint `gorm:"index"` // Output stores the output of a ProwJobRunTest. Output string + // used for partitioning + ProwJobRunTestTimestamp time.Time `gorm:"expression:DATE(timestamp AT TIME ZONE 'UTC')"` + // used for partitioning + ProwJobRunTestRelease string `gorm:"varchar(10)"` } // Suite defines a junit testsuite. Used to differentiate the same test being run in different suites in ProwJobRunTest. diff --git a/pkg/sippyserver/pr_new_tests_worker_test.go b/pkg/sippyserver/pr_new_tests_worker_test.go index a193fbe06a..7a134d41ab 100644 --- a/pkg/sippyserver/pr_new_tests_worker_test.go +++ b/pkg/sippyserver/pr_new_tests_worker_test.go @@ -233,10 +233,11 @@ func TestUnit_getNewTestsForJobRun(t *testing.T) { name: "successful fetch", fetchJobRun: func(dbc *db.DB, jobRunID int64, unknownTests bool, preloads []string, logger *logrus.Entry) (*models.ProwJobRun, error) { pjr := models.ProwJobRun{ + ProwJobRelease: "4.12", Tests: []models.ProwJobRunTest{ - {Test: models.Test{Name: "test1"}, Status: int(v1.TestStatusSuccess)}, - {Test: models.Test{Name: "test2"}, Status: int(v1.TestStatusFailure)}, - {Test: models.Test{Name: "test3"}, Status: int(v1.TestStatusFlake)}, + {ProwJobID: 1, ProwJobRunRelease: "4.12", Test: models.Test{Name: "test1"}, Status: int(v1.TestStatusSuccess)}, + {ProwJobID: 1, ProwJobRunRelease: "4.12", Test: models.Test{Name: "test2"}, Status: int(v1.TestStatusFailure)}, + {ProwJobID: 1, ProwJobRunRelease: "4.12", Test: models.Test{Name: "test3"}, Status: int(v1.TestStatusFlake)}, }, } pjr.ID = 12345 // Gorm model ID for some reason can't be put in the struct literal @@ -252,8 +253,9 @@ func TestUnit_getNewTestsForJobRun(t *testing.T) { name: "error on filtering", fetchJobRun: func(dbc *db.DB, jobRunID int64, unknownTests bool, preloads []string, logger *logrus.Entry) (*models.ProwJobRun, error) { pjr := models.ProwJobRun{ + ProwJobRelease: "4.12", Tests: []models.ProwJobRunTest{ - {Test: models.Test{Name: "test1"}, Status: int(v1.TestStatusSuccess)}, + {ProwJobID: 1, ProwJobRunRelease: "4.12", Test: models.Test{Name: "test1"}, Status: int(v1.TestStatusSuccess)}, }, } pjr.ID = 12345 // Gorm model ID for some reason can't be put in the struct literal From 9039e19b54d847f5ba5add5441ee1d8b61951638 Mon Sep 17 00:00:00 2001 From: Forrest Babcock Date: Mon, 18 May 2026 13:11:09 -0400 Subject: [PATCH 2/5] TRT-1989: clean up gorm annotations --- pkg/db/models/prow.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/db/models/prow.go b/pkg/db/models/prow.go index 646ec96080..1953e33d90 100644 --- a/pkg/db/models/prow.go +++ b/pkg/db/models/prow.go @@ -17,7 +17,7 @@ type ProwJob struct { Kind ProwKind Name string `gorm:"unique"` - Release string `gorm:"varchar(10);index"` + Release string `gorm:"index"` Variants pq.StringArray `gorm:"type:text[];index:idx_prow_jobs_variants,type:gin"` TestGridURL string // Bugs maps to all the bugs we scanned and found this prowjob name mentioned in the description or any comment. @@ -40,7 +40,7 @@ type ProwJobRun struct { ProwJob ProwJob ProwJobID uint `gorm:"index"` // Used for partitioning - ProwJobRelease string `gorm:"varchar(10)"` + ProwJobRelease string // Cluster is the cluster where the prow job was run. Cluster string @@ -93,10 +93,10 @@ type ProwJobRunTest struct { // skips joining on ProwJobRunID just to get ProwJobID ProwJobID uint // used for partitioning - ProwJobRunTimestamp time.Time `gorm:"expression:DATE(timestamp AT TIME ZONE 'UTC')"` + ProwJobRunTimestamp time.Time // used for partitioning - ProwJobRunRelease string `gorm:"varchar(10)"` - TestID uint `gorm:"index;index:idx_prow_job_run_tests_test_id_status"` + ProwJobRunRelease string + TestID uint `gorm:"index;index:idx_prow_job_run_tests_test_id_status"` Test Test // SuiteID may be nil if no suite name could be parsed from the testgrid test name. SuiteID *uint `gorm:"index"` @@ -117,9 +117,9 @@ type ProwJobRunTestOutput struct { // Output stores the output of a ProwJobRunTest. Output string // used for partitioning - ProwJobRunTestTimestamp time.Time `gorm:"expression:DATE(timestamp AT TIME ZONE 'UTC')"` + ProwJobRunTestTimestamp time.Time // used for partitioning - ProwJobRunTestRelease string `gorm:"varchar(10)"` + ProwJobRunTestRelease string } // Suite defines a junit testsuite. Used to differentiate the same test being run in different suites in ProwJobRunTest. From d2ba6418650cffed576c29bacd002758cb73746c Mon Sep 17 00:00:00 2001 From: Forrest Babcock Date: Tue, 19 May 2026 10:25:04 -0400 Subject: [PATCH 3/5] TRT-1989: migration prep planning --- docs/plans/trt-1989-partitioning-prep.md | 221 +++++++++++++++++++++++ pkg/dataloader/prowloader/prow.go | 18 +- pkg/db/db.go | 7 + pkg/db/models/prow.go | 18 +- 4 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 docs/plans/trt-1989-partitioning-prep.md diff --git a/docs/plans/trt-1989-partitioning-prep.md b/docs/plans/trt-1989-partitioning-prep.md new file mode 100644 index 0000000000..718b95329e --- /dev/null +++ b/docs/plans/trt-1989-partitioning-prep.md @@ -0,0 +1,221 @@ +# TRT-1989: Database Partitioning Preparation + +**Date:** 2026-05-19 +**JIRA:** [TRT-1989](https://redhat.atlassian.net/browse/TRT-1989) + +## Problem Statement + +The largest tables in sippy (`prow_job_runs`, `prow_job_run_tests`, +`prow_job_run_test_outputs`) grow continuously and are joined together in +nearly every significant query. Partitioning these tables by release and +timestamp would allow PostgreSQL to prune irrelevant partitions during +queries, dramatically reducing scan sizes and join costs. + +PostgreSQL requires that a partitioned table's primary/unique key includes +the partition key columns. Foreign keys referencing a partitioned table must +match the full unique key. This means child tables that join to a partitioned +parent need the partition key columns — either to form composite FKs or to be +co-partitioned themselves. + +## Approach + +Denormalize `release` and `timestamp` from `prow_job_runs` (via `prow_jobs`) +onto every table that will be partitioned or that holds a FK into a +partitioned table. This is a prep step — no partitioning is applied yet, but +the columns are populated so the migration to partitioned tables can happen +in a subsequent PR. + +## Tables Modified + +### prow_job_runs + +- Added `ProwJobRelease` (denormalized from `prow_jobs.release`) +- The `Timestamp` column already exists on this table + +### prow_job_run_tests + +- Added `ProwJobID` (avoids joining through `prow_job_runs` just to get the + job ID, also needed for variant queries) +- Added `ProwJobRunRelease` +- Added `ProwJobRunTimestamp` + +### prow_job_run_test_outputs + +- Added `ProwJobRunTestRelease` +- Added `ProwJobRunTestTimestamp` +- Named with `ProwJobRunTest` prefix to avoid field name collisions when + GORM preloads `ProwJobRunTestOutput` nested inside `ProwJobRunTest` + +### prow_job_run_prow_pull_requests (many-to-many join table) + +- Converted from implicit GORM-managed join table to explicit model + (`ProwJobRunProwPullRequest`) using `SetupJoinTable` +- Added `ProwJobRunRelease` and `ProwJobRunTimestamp` +- Pull request association changed from GORM auto-association to manual + insert so the denormalized fields are populated +- This table must migrate to partitioned at the same time as `prow_job_runs` + because it holds a FK into that table + +### prow_job_run_annotations + +- Added `ProwJobRunRelease` and `ProwJobRunTimestamp` +- Same FK constraint as pull requests — must migrate with `prow_job_runs` + +## Tables Not Modified (and why) + +- **prow_jobs** — parent table, not partitioned, small (one row per job + definition). Release already exists here as the source of truth. +- **tests, suites, test_ownerships, jira_components** — small dimension + tables joined by their own primary key. No partition benefit. +- **bugs, bug_jobs, bug_tests** — small tables joined to `prow_jobs`, not + to `prow_job_runs`. No partition benefit. +- **release_job_runs** — joins to `prow_job_run_tests` via + `prow_job_run_id`, but is relatively small (one row per payload job run). + Can be addressed later if needed. +- **RegressionJobRun** — stores a string `ProwJobRunID` from BigQuery, not + a FK to `prow_job_runs`. Independent lifecycle. + +## Migration Constraints + +When `prow_job_runs` is partitioned: + +1. Its primary key must include the partition key columns (e.g. + `(id, prow_job_release, timestamp)`). +2. All tables with FKs into `prow_job_runs` must reference the full + composite key — meaning they need the partition key columns too. +3. Tables with FKs **from** `prow_job_runs` to non-partitioned tables + (annotations, pull request join table) must either be co-partitioned + or have their FKs dropped. + +This means `prow_job_runs`, `prow_job_run_annotations`, and +`prow_job_run_prow_pull_requests` must all migrate to partitioned in a +single step. The cascade delete constraints on these relationships +(`constraint:OnDelete:CASCADE`) remain in place for now and will be +addressed during the partitioning migration. + +## Backfill + +Existing rows will have NULL/zero-value for the new columns. A backfill +migration is required before partitioning: + +```sql +UPDATE prow_job_runs r + SET prow_job_release = j.release + FROM prow_jobs j + WHERE r.prow_job_id = j.id + AND r.prow_job_release IS NULL OR r.prow_job_release = ''; + +UPDATE prow_job_run_tests t + SET prow_job_id = r.prow_job_id, + prow_job_run_release = j.release, + prow_job_run_timestamp = r.timestamp + FROM prow_job_runs r + JOIN prow_jobs j ON r.prow_job_id = j.id + WHERE t.prow_job_run_id = r.id + AND (t.prow_job_run_release IS NULL OR t.prow_job_run_release = ''); +``` + +Similar updates for `prow_job_run_test_outputs`, `prow_job_run_annotations`, +and `prow_job_run_prow_pull_requests`. + +## Phased Migration Plan + +The migration to partitioned tables can be done incrementally. Each phase +delivers value on its own and validates the approach before the next step. + +### Phase 1: Column Prep (this PR) + +Add denormalized release and timestamp columns to all tables. New data is +populated on insert. No query changes, no indexing changes. + +### Phase 2: Backfill + Index + +Backfill existing rows (see SQL above). Add composite indexes that mirror +the future partition key: + +```sql +CREATE INDEX CONCURRENTLY idx_prow_job_runs_release_timestamp + ON prow_job_runs (prow_job_release, timestamp); + +CREATE INDEX CONCURRENTLY idx_prow_job_run_tests_release_timestamp + ON prow_job_run_tests (prow_job_run_release, prow_job_run_timestamp); + +CREATE INDEX CONCURRENTLY idx_prow_job_run_test_outputs_release_timestamp + ON prow_job_run_test_outputs (prow_job_run_test_release, prow_job_run_test_timestamp); + +CREATE INDEX CONCURRENTLY idx_prow_job_run_annotations_release_timestamp + ON prow_job_run_annotations (prow_job_run_release, prow_job_run_timestamp); + +CREATE INDEX CONCURRENTLY idx_prow_job_run_prow_pull_requests_release_timestamp + ON prow_job_run_prow_pull_requests (prow_job_run_release, prow_job_run_timestamp); +``` + +### Phase 3: Migrate Queries + +Update queries to filter on the denormalized columns instead of (or in +addition to) the joined parent columns. This can be done incrementally — +each query can be updated, validated with `EXPLAIN ANALYZE`, and merged +independently. + +Queries currently filter on `prow_job_runs.timestamp` and +`prow_jobs.release` via joins. Once the child tables are partitioned, +those filters won't help the planner prune child table partitions. Each +query needs filters on the partitioned table's own columns. + +#### Queries that need release + timestamp filters added + +| Query | File | Currently filters on | Add filter on | +|-------|------|---------------------|---------------| +| `TestOutputs` | `pkg/db/query/test_queries.go:285` | `prow_job_runs.timestamp`, `prow_jobs.release` | `prow_job_run_test_outputs.prow_job_run_test_timestamp`, `.prow_job_run_test_release` | +| `TestDurations` | `pkg/db/query/test_queries.go:315` | `prow_job_runs.timestamp`, `prow_jobs.release` | `prow_job_run_tests.prow_job_run_timestamp`, `.prow_job_run_release` | +| `GetRecentTestFailures` | `pkg/api/recent_test_failures.go:32` | `prow_job_runs.timestamp`, `prow_jobs.release` | `prow_job_run_tests.prow_job_run_timestamp`, `.prow_job_run_release` | +| `testReportMatView` | `pkg/db/views.go:244-265` | `prow_job_runs."timestamp"` | `prow_job_run_tests.prow_job_run_timestamp`, `.prow_job_run_release` | +| `testAnalysisByJobMatView` | `pkg/db/views.go:298-308` | `prow_job_runs."timestamp"` | `prow_job_run_tests.prow_job_run_timestamp`, `.prow_job_run_release` | +| `prowJobFailedTestsMatView` | `pkg/db/views.go:314-322` | `prow_job_runs."timestamp"` (none explicit) | `prow_job_run_tests.prow_job_run_timestamp` | +| `payloadTestFailuresMatView` | `pkg/db/views.go:348` | `release_tags.release_time` | `prow_job_run_tests.prow_job_run_timestamp` | +| `testStatusQuery` (CR) | `pkg/api/componentreadiness/.../provider.go:312` | `pjr.timestamp`, `pj.release` | `pjrt.prow_job_run_timestamp`, `pjrt.prow_job_run_release` | +| `testDetailQuery` (CR) | `pkg/api/componentreadiness/.../provider.go:518` | `pjr.timestamp`, `pj.release` | `pjrt.prow_job_run_timestamp`, `pjrt.prow_job_run_release` | +| `test_results()` function | `pkg/db/functions.go:51-53` | `timestamp` (via join) | `prow_job_run_tests.prow_job_run_timestamp`, `.prow_job_run_release` | + +#### Queries that need filters added (currently have none) + +| Query | File | Tables scanned | Fix | +|-------|------|---------------|-----| +| `jobRunsReportMatView` CTEs | `pkg/db/views.go:157-192` | `prow_job_run_tests`, `prow_job_run_prow_pull_requests` | Add timestamp/release filters to each CTE | +| `job_results()` `repo_org_jobs` CTE | `pkg/db/functions.go:82-88` | `prow_job_runs`, `prow_job_run_prow_pull_requests` | Add release + timestamp filter | +| `job_results()` `merged_prs` CTE | `pkg/db/functions.go:91-99` | `prow_job_runs`, `prow_job_run_prow_pull_requests` | Add `prow_job_runs.timestamp` filter alongside `merged_at` | +| `HasBuildClusterData` | `pkg/db/query/build_clusters.go:14` | `prow_job_runs` | Add timestamp bound | +| `PrintOverallReleaseHealthFromDB` | `pkg/api/health.go:88` | `prow_job_runs` | Add release filter or timestamp bound | +| `PrintAutocompleteFromDB` (cluster) | `pkg/api/autocomplete.go:77` | `prow_job_runs` | Add timestamp bound | +| `ProwJobRunIDs` | `pkg/db/query/job_queries.go:59` | `prow_job_runs` | Add timestamp parameters | +| `BuildClusterHealth` | `pkg/db/query/build_clusters.go:21` | `prow_job_runs` | Move timestamp from CASE to WHERE | +| `BuildClusterAnalysis` | `pkg/db/query/build_clusters.go:60` | `prow_job_runs` | Add release filter | + +#### Queries that can potentially drop joins + +Once queries filter on the denormalized columns directly, some joins +become unnecessary. For example, queries that only joined +`prow_job_runs` to get `timestamp` or joined `prow_jobs` to get +`release` can use the local columns instead. This reduces join cost +independent of partitioning. + +| Query | Join that can be dropped | Reason | +|-------|-------------------------|--------| +| `testAnalysisByJobMatView` | `JOIN prow_job_runs` | Only used for `timestamp` and to reach `prow_jobs` | +| `prowJobFailedTestsMatView` | `JOIN prow_job_runs` (partial) | Used for `timestamp` and `prow_job_id` — both now on `prow_job_run_tests` | +| `test_results()` function | `JOIN prow_job_runs` | Only used for `timestamp` and to reach `prow_jobs.release` | + +### Phase 4: Partition Tables + +With columns populated, indexes in place, and queries updated, apply +partitioning. This requires: + +1. Migrate `prow_job_runs`, `prow_job_run_annotations`, and + `prow_job_run_prow_pull_requests` together (FK constraints). +2. Migrate `prow_job_run_tests` (FK to `prow_job_runs` must use + composite key). +3. Migrate `prow_job_run_test_outputs` (FK to `prow_job_run_tests` + must use composite key). +4. Update cascade delete constraints or drop them in favor of + partition-based data lifecycle management. +5. Validate with `EXPLAIN ANALYZE` that partition pruning is active. diff --git a/pkg/dataloader/prowloader/prow.go b/pkg/dataloader/prowloader/prow.go index 33f7f2ad20..fec6bd8656 100644 --- a/pkg/dataloader/prowloader/prow.go +++ b/pkg/dataloader/prowloader/prow.go @@ -945,8 +945,10 @@ func (pl *ProwLoader) processGCSBucketJobRun(ctx context.Context, pj *prow.ProwJ var annotations []models.ProwJobRunAnnotation for k, v := range pj.Annotations { annotations = append(annotations, models.ProwJobRunAnnotation{ - Key: k, - Value: v, + Key: k, + Value: v, + ProwJobRunRelease: dbProwJob.Release, + ProwJobRunTimestamp: pj.Status.StartTime, }) } @@ -968,7 +970,6 @@ func (pl *ProwLoader) processGCSBucketJobRun(ctx context.Context, pj *prow.ProwJ GCSBucket: pj.Spec.DecorationConfig.GCSConfiguration.Bucket, Timestamp: pj.Status.StartTime, OverallResult: overallResult, - PullRequests: pulls, TestFailures: failures, Succeeded: overallResult == sippyprocessingv1.JobSucceeded, Labels: labels, @@ -977,6 +978,17 @@ func (pl *ProwLoader) processGCSBucketJobRun(ctx context.Context, pj *prow.ProwJ if err != nil { return err } + + for _, pull := range pulls { + if err := pl.dbc.DB.WithContext(ctx).Create(&models.ProwJobRunProwPullRequest{ + ProwJobRunID: uint(id), + ProwPullRequestID: pull.ID, + ProwJobRunRelease: dbProwJob.Release, + ProwJobRunTimestamp: pj.Status.StartTime, + }).Error; err != nil { + return err + } + } // Looks like sometimes, we might be getting duplicate entries from bigquery: pl.prowJobRunCacheLock.Lock() pl.prowJobRunCache[uint(id)] = true diff --git a/pkg/db/db.go b/pkg/db/db.go index 478d8ae311..4ae99345e1 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -72,6 +72,12 @@ func (d *DB) UpdateSchema(reportEnd *time.Time) error { return err } + // Register explicit join table so GORM uses our model (with release/timestamp) + // instead of auto-generating a bare join table. + if err := d.DB.SetupJoinTable(&models.ProwJobRun{}, "PullRequests", &models.ProwJobRunProwPullRequest{}); err != nil { + return err + } + // List of all models to migrate modelsToMigrate := []any{ &models.ReleaseTag{}, @@ -88,6 +94,7 @@ func (d *DB) UpdateSchema(reportEnd *time.Time) error { &models.APISnapshot{}, &models.Bug{}, &models.ProwPullRequest{}, + &models.ProwJobRunProwPullRequest{}, &models.SchemaHash{}, &models.PullRequestComment{}, &models.JiraIncident{}, diff --git a/pkg/db/models/prow.go b/pkg/db/models/prow.go index 1953e33d90..5d9cbb6504 100644 --- a/pkg/db/models/prow.go +++ b/pkg/db/models/prow.go @@ -68,12 +68,24 @@ type ProwJobRun struct { ClusterData ClusterData `gorm:"-"` } +// ProwJobRunProwPullRequest is the explicit join table for the many-to-many relationship +// between ProwJobRun and ProwPullRequest. Release and timestamp are denormalized from +// ProwJobRun to support future partitioning. +type ProwJobRunProwPullRequest struct { + ProwJobRunID uint `gorm:"primaryKey"` + ProwPullRequestID uint `gorm:"primaryKey"` + ProwJobRunRelease string + ProwJobRunTimestamp time.Time +} + // ProwJobRunAnnotation stores a single key-value annotation for a ProwJobRun. type ProwJobRunAnnotation struct { gorm.Model - ProwJobRunID uint `gorm:"index;uniqueIndex:idx_prow_job_run_annotations_key"` - Key string `gorm:"uniqueIndex:idx_prow_job_run_annotations_key"` - Value string + ProwJobRunID uint `gorm:"index;uniqueIndex:idx_prow_job_run_annotations_key"` + Key string `gorm:"uniqueIndex:idx_prow_job_run_annotations_key"` + Value string + ProwJobRunRelease string + ProwJobRunTimestamp time.Time } type Test struct { From e6019efeb741e8b44238d2352aeb3c84980ddc7f Mon Sep 17 00:00:00 2001 From: Forrest Babcock Date: Tue, 19 May 2026 11:24:02 -0400 Subject: [PATCH 4/5] TRT-1989: migration prep feedback --- docs/plans/trt-1989-partitioning-prep.md | 2 +- pkg/dataloader/prowloader/prow.go | 61 +++++++++++++----------- pkg/db/db.go | 2 +- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/docs/plans/trt-1989-partitioning-prep.md b/docs/plans/trt-1989-partitioning-prep.md index 718b95329e..e2f4e62e7b 100644 --- a/docs/plans/trt-1989-partitioning-prep.md +++ b/docs/plans/trt-1989-partitioning-prep.md @@ -103,7 +103,7 @@ UPDATE prow_job_runs r SET prow_job_release = j.release FROM prow_jobs j WHERE r.prow_job_id = j.id - AND r.prow_job_release IS NULL OR r.prow_job_release = ''; + AND (r.prow_job_release IS NULL OR r.prow_job_release = ''); UPDATE prow_job_run_tests t SET prow_job_id = r.prow_job_id, diff --git a/pkg/dataloader/prowloader/prow.go b/pkg/dataloader/prowloader/prow.go index fec6bd8656..02d3657784 100644 --- a/pkg/dataloader/prowloader/prow.go +++ b/pkg/dataloader/prowloader/prow.go @@ -957,37 +957,42 @@ func (pl *ProwLoader) processGCSBucketJobRun(ctx context.Context, pj *prow.ProwJ duration = pj.Status.CompletionTime.Sub(pj.Status.StartTime) } - err = pl.dbc.DB.WithContext(ctx).Create(&models.ProwJobRun{ - Model: gorm.Model{ - ID: uint(id), - }, - Cluster: pj.Spec.Cluster, - Duration: duration, - ProwJob: *dbProwJob, - ProwJobID: dbProwJob.ID, - ProwJobRelease: dbProwJob.Release, - URL: pj.Status.URL, - GCSBucket: pj.Spec.DecorationConfig.GCSConfiguration.Bucket, - Timestamp: pj.Status.StartTime, - OverallResult: overallResult, - TestFailures: failures, - Succeeded: overallResult == sippyprocessingv1.JobSucceeded, - Labels: labels, - Annotations: annotations, - }).Error - if err != nil { - return err - } - - for _, pull := range pulls { - if err := pl.dbc.DB.WithContext(ctx).Create(&models.ProwJobRunProwPullRequest{ - ProwJobRunID: uint(id), - ProwPullRequestID: pull.ID, - ProwJobRunRelease: dbProwJob.Release, - ProwJobRunTimestamp: pj.Status.StartTime, + err = pl.dbc.DB.WithContext(ctx).Transaction(func(tx *gorm.DB) error { + if err := tx.Create(&models.ProwJobRun{ + Model: gorm.Model{ + ID: uint(id), + }, + Cluster: pj.Spec.Cluster, + Duration: duration, + ProwJob: *dbProwJob, + ProwJobID: dbProwJob.ID, + ProwJobRelease: dbProwJob.Release, + URL: pj.Status.URL, + GCSBucket: pj.Spec.DecorationConfig.GCSConfiguration.Bucket, + Timestamp: pj.Status.StartTime, + OverallResult: overallResult, + TestFailures: failures, + Succeeded: overallResult == sippyprocessingv1.JobSucceeded, + Labels: labels, + Annotations: annotations, }).Error; err != nil { return err } + + for _, pull := range pulls { + if err := tx.Create(&models.ProwJobRunProwPullRequest{ + ProwJobRunID: uint(id), + ProwPullRequestID: pull.ID, + ProwJobRunRelease: dbProwJob.Release, + ProwJobRunTimestamp: pj.Status.StartTime, + }).Error; err != nil { + return err + } + } + return nil + }) + if err != nil { + return err } // Looks like sometimes, we might be getting duplicate entries from bigquery: pl.prowJobRunCacheLock.Lock() diff --git a/pkg/db/db.go b/pkg/db/db.go index 4ae99345e1..ed19837fb4 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -75,7 +75,7 @@ func (d *DB) UpdateSchema(reportEnd *time.Time) error { // Register explicit join table so GORM uses our model (with release/timestamp) // instead of auto-generating a bare join table. if err := d.DB.SetupJoinTable(&models.ProwJobRun{}, "PullRequests", &models.ProwJobRunProwPullRequest{}); err != nil { - return err + return fmt.Errorf("setup join table ProwJobRun.PullRequests: %w", err) } // List of all models to migrate From b252569d945494e4299d4cd8553669532da9b7c8 Mon Sep 17 00:00:00 2001 From: Forrest Babcock Date: Tue, 19 May 2026 12:49:14 -0400 Subject: [PATCH 5/5] TRT-1989: index enablement --- docs/plans/trt-1989-phase2-indexes.md | 128 ++++++++++++++++++++++++++ pkg/db/models/prow.go | 28 +++--- 2 files changed, 142 insertions(+), 14 deletions(-) create mode 100644 docs/plans/trt-1989-phase2-indexes.md diff --git a/docs/plans/trt-1989-phase2-indexes.md b/docs/plans/trt-1989-phase2-indexes.md new file mode 100644 index 0000000000..0a2f6d90f0 --- /dev/null +++ b/docs/plans/trt-1989-phase2-indexes.md @@ -0,0 +1,128 @@ +# TRT-1989 Phase 2: Composite Indexes on Denormalized Columns + +**Date:** 2026-05-19 +**JIRA:** [TRT-1989](https://redhat.atlassian.net/browse/TRT-1989) +**Depends on:** Phase 1 — column prep (`trt-1989-partitioning-prep.md`) + +## Purpose + +Phase 1 added denormalized `release` and `timestamp` columns to every table +that will be partitioned or holds a FK into a partitioned table. Phase 2 +adds composite indexes on those columns so the query planner can use them +immediately — before partitioning is applied. + +These indexes mirror the future partition key `(release, timestamp)`. Once +the tables are partitioned, each partition inherits a local copy of the +index, and the planner uses partition pruning instead. The indexes added +here serve two purposes: + +1. **Immediate benefit** — queries migrated in Phase 3 to filter on the + denormalized columns will use these indexes on the current + non-partitioned tables. +2. **Validation** — exercising the indexes under production workload + confirms the column data is correct before committing to partitioning. + +## Changes + +All changes are GORM index tags on model structs in +`pkg/db/models/prow.go`. GORM `AutoMigrate` creates the indexes +automatically on the next migration run. + +### prow_job_runs + +Added composite index `idx_prow_job_runs_release_timestamp` across +`ProwJobRelease` and `Timestamp`. + +Also added a standalone index on `ProwJobRunTest.ProwJobID` to support +variant queries that previously required joining through `prow_job_runs`. + +### prow_job_run_tests + +Added composite index `idx_prow_job_run_tests_release_timestamp` across +`ProwJobRunTimestamp` and `ProwJobRunRelease`. + +### prow_job_run_test_outputs + +Added composite index `idx_prow_job_run_test_outputs_release_timestamp` +across `ProwJobRunTestTimestamp` and `ProwJobRunTestRelease`. + +### prow_job_run_prow_pull_requests + +Added composite index +`idx_prow_job_run_prow_pull_requests_release_timestamp` across +`ProwJobRunRelease` and `ProwJobRunTimestamp`. + +### prow_job_run_annotations + +Added composite index `idx_prow_job_run_annotations_release_timestamp` +across `ProwJobRunRelease` and `ProwJobRunTimestamp`. + +## Explicit SQL + +GORM `AutoMigrate` will create these indexes on the next migration run. +If you prefer to create them manually — for example, using `CONCURRENTLY` +to avoid locking production tables — run these statements directly. + +`CREATE INDEX CONCURRENTLY` cannot run inside a transaction, so each +statement must be executed individually (not wrapped in `BEGIN`/`COMMIT`). + +### prow_job_runs + +```sql +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_prow_job_runs_release_timestamp + ON prow_job_runs (prow_job_release, "timestamp"); +``` + +### prow_job_run_tests + +```sql +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_prow_job_run_tests_prow_job_id + ON prow_job_run_tests (prow_job_id); + +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_prow_job_run_tests_release_timestamp + ON prow_job_run_tests (prow_job_run_timestamp, prow_job_run_release); +``` + +### prow_job_run_test_outputs + +```sql +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_prow_job_run_test_outputs_release_timestamp + ON prow_job_run_test_outputs (prow_job_run_test_timestamp, prow_job_run_test_release); +``` + +### prow_job_run_prow_pull_requests + +```sql +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_prow_job_run_prow_pull_requests_release_timestamp + ON prow_job_run_prow_pull_requests (prow_job_run_release, prow_job_run_timestamp); +``` + +### prow_job_run_annotations + +```sql +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_prow_job_run_annotations_release_timestamp + ON prow_job_run_annotations (prow_job_run_release, prow_job_run_timestamp); +``` + +## Notes + +- **Safe to create before deploying model updates.** GORM `AutoMigrate` + only adds — it never drops indexes, columns, or tables it doesn't + recognize. Indexes created manually will persist through any number of + `AutoMigrate` runs on the old model. Once the updated model with index + tags is deployed, `AutoMigrate` sees the indexes already exist and + skips them. There is no rollback risk. +- `CONCURRENTLY` avoids taking an exclusive lock on the table, allowing + reads and writes to continue during index creation. It is slower but + safe for production use. +- If the index already exists (e.g., GORM created it during a prior + migration), `IF NOT EXISTS` makes the statement a no-op. +- GORM `AutoMigrate` does **not** use `CONCURRENTLY` — it takes a brief + lock. On large tables this can block writes for the duration of the + index build. For production deployments, prefer creating the indexes + manually with the SQL above ahead of the code deploy, so that + `AutoMigrate` finds them already in place. +- Column order in the index matches the expected query pattern: most + queries filter on release first (equality), then timestamp (range). + The `prow_job_run_tests` index leads with timestamp because the + materialized view queries filter primarily on timestamp ranges. diff --git a/pkg/db/models/prow.go b/pkg/db/models/prow.go index 5d9cbb6504..db3d607de5 100644 --- a/pkg/db/models/prow.go +++ b/pkg/db/models/prow.go @@ -40,7 +40,7 @@ type ProwJobRun struct { ProwJob ProwJob ProwJobID uint `gorm:"index"` // Used for partitioning - ProwJobRelease string + ProwJobRelease string `gorm:"index:idx_prow_job_runs_release_timestamp"` // Cluster is the cluster where the prow job was run. Cluster string @@ -57,7 +57,7 @@ type ProwJobRun struct { // KnownFailure is true if the job run failed, but we found a bug that is likely related already filed. KnownFailure bool Succeeded bool - Timestamp time.Time `gorm:"index;index:idx_prow_job_runs_timestamp_date,expression:DATE(timestamp AT TIME ZONE 'UTC')"` + Timestamp time.Time `gorm:"index;index:idx_prow_job_runs_timestamp_date,expression:DATE(timestamp AT TIME ZONE 'UTC');index:idx_prow_job_runs_release_timestamp"` Duration time.Duration OverallResult v1.JobOverallResult `gorm:"index"` // Labels stores the IDs of labels applied to this job run @@ -72,10 +72,10 @@ type ProwJobRun struct { // between ProwJobRun and ProwPullRequest. Release and timestamp are denormalized from // ProwJobRun to support future partitioning. type ProwJobRunProwPullRequest struct { - ProwJobRunID uint `gorm:"primaryKey"` - ProwPullRequestID uint `gorm:"primaryKey"` - ProwJobRunRelease string - ProwJobRunTimestamp time.Time + ProwJobRunID uint `gorm:"primaryKey"` + ProwPullRequestID uint `gorm:"primaryKey"` + ProwJobRunRelease string `gorm:"index:idx_prow_job_run_prow_pull_requests_release_timestamp"` + ProwJobRunTimestamp time.Time `gorm:"index:idx_prow_job_run_prow_pull_requests_release_timestamp"` } // ProwJobRunAnnotation stores a single key-value annotation for a ProwJobRun. @@ -84,8 +84,8 @@ type ProwJobRunAnnotation struct { ProwJobRunID uint `gorm:"index;uniqueIndex:idx_prow_job_run_annotations_key"` Key string `gorm:"uniqueIndex:idx_prow_job_run_annotations_key"` Value string - ProwJobRunRelease string - ProwJobRunTimestamp time.Time + ProwJobRunRelease string `gorm:"index:idx_prow_job_run_annotations_release_timestamp"` + ProwJobRunTimestamp time.Time `gorm:"index:idx_prow_job_run_annotations_release_timestamp"` } type Test struct { @@ -103,12 +103,12 @@ type ProwJobRunTest struct { ProwJobRun ProwJobRun // used for variants // skips joining on ProwJobRunID just to get ProwJobID - ProwJobID uint + ProwJobID uint `gorm:"index"` // used for partitioning - ProwJobRunTimestamp time.Time + ProwJobRunTimestamp time.Time `gorm:"index:idx_prow_job_run_tests_release_timestamp"` // used for partitioning - ProwJobRunRelease string - TestID uint `gorm:"index;index:idx_prow_job_run_tests_test_id_status"` + ProwJobRunRelease string `gorm:"index:idx_prow_job_run_tests_release_timestamp"` + TestID uint `gorm:"index;index:idx_prow_job_run_tests_test_id_status"` Test Test // SuiteID may be nil if no suite name could be parsed from the testgrid test name. SuiteID *uint `gorm:"index"` @@ -129,9 +129,9 @@ type ProwJobRunTestOutput struct { // Output stores the output of a ProwJobRunTest. Output string // used for partitioning - ProwJobRunTestTimestamp time.Time + ProwJobRunTestTimestamp time.Time `gorm:"index:idx_prow_job_run_test_outputs_release_timestamp"` // used for partitioning - ProwJobRunTestRelease string + ProwJobRunTestRelease string `gorm:"index:idx_prow_job_run_test_outputs_release_timestamp"` } // Suite defines a junit testsuite. Used to differentiate the same test being run in different suites in ProwJobRunTest.