Conversation
| /// parent is not current). Returns (rows_deleted, next_marker). | ||
| fn delete_orphaned_sitrep_metadata_query( | ||
| marker: Uuid, | ||
| batch_size: std::num::NonZeroU32, |
There was a problem hiding this comment.
If we don't paginate, this incurs a full table scan. So: We have to paginate.
There was a problem hiding this comment.
nit, take it or leave it: it might be kinda nice if the background task's status thingy was able to indicate how many of these it had to do (and also the batch size?)
There was a problem hiding this comment.
Done (plumbed batch info through bg task, out to omdb) in 076920b
| .transaction(&conn, |conn| { | ||
| async move { | ||
| // Step 1: Delete orphaned fm_sitrep metadata rows. | ||
| let mut sitreps_deleted = 0usize; |
There was a problem hiding this comment.
So, the structure here is basically:
- Delete sitrep metadata rows (make orphaned rows)
- Clean up orphaned rows (which maybe we just made, or maybe got created by an errant INSERT operation that bailed out)
We could make this transaction smaller by "limiting how many sitrep metadata rows we create at once".
For example, we could:
- "Only create up to 64 orphans in a txn"
- ... then clean up all their data, within this transaction (to avoid torn reads)
- ... then, COMMIT the transaction, and do another batch, until we stop cleaning stuff up / hit some arbitrary limit / etc
Thank you for your support in this matter :D |
hawkw
left a comment
There was a problem hiding this comment.
this code is kinda wild, i'm impressed --- thank you for humoring me
| pub const fn table_name(&self) -> &'static str { | ||
| match self { | ||
| Self::CaseEreport => "fm_ereport_in_case", | ||
| Self::AlertRequest => "fm_alert_request", | ||
| Self::Case => "fm_case", | ||
| } | ||
| } | ||
|
|
||
| const fn sitrep_id_column(&self) -> &'static str { | ||
| match self { | ||
| Self::CaseEreport => "sitrep_id", | ||
| Self::AlertRequest => "sitrep_id", | ||
| Self::Case => "sitrep_id", | ||
| } | ||
| } |
There was a problem hiding this comment.
...this is how you know the queries you're about to read are gonna be Really Cool, huh?
There was a problem hiding this comment.
So the one really big concern here is that if you want to have your new sitrep child table --- such as (for example) the alert requests that @mergeconflict is working on --- you gotta remember to add it here or else your new thing won't be GCed. Which, once I actually think about it at all, I realize was already the case, since you would have to write queries for deleting your new thing, so this doesn't actually make the status quo any worse, and in fact, kinda makes it easier, since all you have to do is add your new table here.
With that said, I wonder if we ought to have some kinda test that asks CRDB for all the table/column names and does a big panic!("HEY DID YOU FORGET TO ADD YOUR NEW TABLE TO THE SITREP GC CODE?") if it encounters a table with a name like fm_* with a sitrep_id column, or something...?
There was a problem hiding this comment.
We have almost an identical problem in reconfigurator-land for blueprints. And there is a test to catch this! I'll see if I can pull it over and make it work for sitreps too -- we did prefix matching there.
There was a problem hiding this comment.
Added a test to catch these cases in a88fd11
| // Step 2: For each child table, paginate through and | ||
| // delete rows whose sitrep_id no longer exists in | ||
| // fm_sitrep. | ||
| let mut cases_deleted = 0usize; | ||
| let mut case_ereports_deleted = 0usize; | ||
| let mut alert_requests_deleted = 0usize; | ||
|
|
||
| for (table, counter) in [ | ||
| ( | ||
| SitrepChildTable::CaseEreport, | ||
| &mut case_ereports_deleted, | ||
| ), | ||
| ( | ||
| SitrepChildTable::AlertRequest, | ||
| &mut alert_requests_deleted, | ||
| ), | ||
| (SitrepChildTable::Case, &mut cases_deleted), | ||
| ] { |
There was a problem hiding this comment.
it occurs to me that, if this gets somewhat unwieldy as we add more tables, this could maybe just be a map of (SitrepChildTable, usize). Although I do kinda not love allocating a whole hashmap for that... 🤷♀️
| let result = | ||
| Self::delete_orphaned_sitrep_metadata_query( | ||
| marker, | ||
| SQL_BATCH_SIZE, | ||
| ) | ||
| .get_result_async::<(i64, Option<Uuid>)>(&conn) | ||
| .await?; | ||
|
|
||
| let (deleted, next_marker) = result; |
There was a problem hiding this comment.
nit: mayhaps
| let result = | |
| Self::delete_orphaned_sitrep_metadata_query( | |
| marker, | |
| SQL_BATCH_SIZE, | |
| ) | |
| .get_result_async::<(i64, Option<Uuid>)>(&conn) | |
| .await?; | |
| let (deleted, next_marker) = result; | |
| let (deleted, next_marker) = | |
| Self::delete_orphaned_sitrep_metadata_query( | |
| marker, | |
| SQL_BATCH_SIZE, | |
| ) | |
| .get_result_async::<(i64, Option<Uuid>)>(&conn) | |
| .await?; |
| @@ -998,6 +913,244 @@ impl DataStore { | |||
| Ok(sitreps_deleted) | |||
| } | |||
There was a problem hiding this comment.
there are some tests which still use it, and transitioning them over is still a bit hairy - I can mark it as cfg(test) though and make it non-pub?
| /// parent is not current). Returns (rows_deleted, next_marker). | ||
| fn delete_orphaned_sitrep_metadata_query( | ||
| marker: Uuid, | ||
| batch_size: std::num::NonZeroU32, |
There was a problem hiding this comment.
nit, take it or leave it: it might be kinda nice if the background task's status thingy was able to indicate how many of these it had to do (and also the batch size?)
| ) -> Result<GcOrphansResult, Error> { | ||
| let conn = self.pool_connection_authorized(opctx).await?; | ||
|
|
||
| // TODO(eliza): there should probably be an authz object for the fm sitrep? |
| /// All child tables. Used by the GC loop and by tests to ensure | ||
| /// completeness. If you add a new variant, add it here — the GC | ||
| /// loop will fail to compile until you handle it. | ||
| pub const ALL: &[SitrepChildTable] = | ||
| &[Self::CaseEreport, Self::AlertRequest, Self::Case]; |
There was a problem hiding this comment.
can this be accomplished using #[derive(strum::VariantArray)] or whatever? or is it specifically intended to be the thing that catches variants you forgot to add?
| /// Identifies a child table of `fm_sitrep` for use in orphan-deletion | ||
| /// queries. Using an enum (rather than raw strings) ensures the table and | ||
| /// column names are known at compile time, preventing SQL injection. | ||
| #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] | ||
| pub enum SitrepChildTable { | ||
| CaseEreport, | ||
| AlertRequest, | ||
| Case, | ||
| } | ||
|
|
||
| impl SitrepChildTable { | ||
| /// All child tables. Used by the GC loop and by tests to ensure | ||
| /// completeness. If you add a new variant, add it here — the GC | ||
| /// loop will fail to compile until you handle it. | ||
| pub const ALL: &[SitrepChildTable] = | ||
| &[Self::CaseEreport, Self::AlertRequest, Self::Case]; | ||
| } | ||
|
|
||
| impl SitrepChildTable { | ||
| pub const fn table_name(&self) -> &'static str { | ||
| match self { | ||
| Self::CaseEreport => "fm_ereport_in_case", | ||
| Self::AlertRequest => "fm_alert_request", | ||
| Self::Case => "fm_case", | ||
| } | ||
| } | ||
|
|
||
| pub(crate) const fn sitrep_id_column(&self) -> &'static str { | ||
| match self { | ||
| Self::CaseEreport => "sitrep_id", | ||
| Self::AlertRequest => "sitrep_id", | ||
| Self::Case => "sitrep_id", | ||
| } | ||
| } |
There was a problem hiding this comment.
don't feel like you have to figure this out now, but i kinda wonder how easy it would be to make the developer experience of adding a new table to the GC targets be just adding one line like
// ...
MyCoolTable, "fm_my_cool_table", "sitrep_id",
// ...to something? i bet we could rig it up so that there was a macro that basically just generated everything else, since it's so repetitive...
Fixes #10131
In an attempt to avoid using transactions during the INSERT pathway (see: #10133)
this PR instead tries to alter how DELETE works for sitreps.
Rather than using a two-step GC process (listing orphaned sitreps and deleting
them - even while concurrent INSERT operations could be underway) this PR
also enables the deletion of "rows within a sitrep, for which the sitrep metadata
record no longer exists".
This allows us to delete rows that were previously leaked, and is also now the
primary mechanism by which rows are removed: we delete the high-level
sitrep first, then remove all these newly-orphaned rows. This is still performed
within a transaction on the DELETE pathway to avoid reading torn sitreps
during the loading phase (as originally appeared in #9594).