Skip to content

Commit 7e5d5d2

Browse files
committed
fix: use selectinload for experiment relationships in score set search
Switch one-to-many experiment relationship loading (keyword_objs, doi_identifiers, publication_identifier_associations, raw_read_identifiers) from joinedload to selectinload inside the contains_eager block. This prevents row multiplication from causing the SQL LIMIT to apply to multiplied rows rather than unique score sets, which resulted in search returning fewer results than expected on databases with rich experiment metadata.
1 parent 95fab85 commit 7e5d5d2

2 files changed

Lines changed: 59 additions & 4 deletions

File tree

src/mavedb/lib/score_sets.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,18 +238,26 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
238238
score_sets: list[ScoreSet] = (
239239
query.join(ScoreSet.experiment)
240240
.options(
241+
# Use selectinload for one-to-many experiment relationships to avoid row
242+
# multiplication in the main query. joinedload would LEFT OUTER JOIN these
243+
# into the main SQL query, and because they're nested inside contains_eager,
244+
# SQLAlchemy's subquery-wrapping logic doesn't protect the LIMIT clause from
245+
# being applied to multiplied rows rather than unique score sets. This would
246+
# cause the count of returned score sets to be less than the requested limit,
247+
# and the count query would be triggered even when the number of unique score
248+
# sets in the main query results exceeds the limit.
241249
contains_eager(ScoreSet.experiment).options(
242250
joinedload(Experiment.experiment_set),
243-
joinedload(Experiment.keyword_objs).joinedload(
251+
selectinload(Experiment.keyword_objs).joinedload(
244252
ExperimentControlledKeywordAssociation.controlled_keyword
245253
),
246254
joinedload(Experiment.created_by),
247255
joinedload(Experiment.modified_by),
248-
joinedload(Experiment.doi_identifiers),
249-
joinedload(Experiment.publication_identifier_associations).joinedload(
256+
selectinload(Experiment.doi_identifiers),
257+
selectinload(Experiment.publication_identifier_associations).joinedload(
250258
ExperimentPublicationIdentifierAssociation.publication
251259
),
252-
joinedload(Experiment.raw_read_identifiers),
260+
selectinload(Experiment.raw_read_identifiers),
253261
selectinload(Experiment.score_sets).options(
254262
joinedload(ScoreSet.doi_identifiers),
255263
joinedload(ScoreSet.publication_identifier_associations).joinedload(

tests/routers/test_score_set.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2403,6 +2403,53 @@ def test_search_filter_options_hidden_by_published_superseding_version(
24032403
assert target_name not in target_names
24042404

24052405

def test_search_score_sets_reports_correct_total_count_with_limit(
    session, data_provider, client, setup_router_db, data_files
):
    """When more published score sets exist than the search limit, num_score_sets should reflect the true total."""
    num_score_sets = 3
    # Publish more score sets than the search limit so the paginated response
    # must report a total count larger than the page size.
    for i in range(num_score_sets):
        exp = create_experiment(client, {"title": f"Experiment {i}"})
        ss = create_seq_score_set(client, exp["urn"], update={"title": f"Score Set {i}"})
        ss = mock_worker_variant_insertion(client, session, data_provider, ss, data_files / "scores.csv")

        # Publishing enqueues a background job; stub out the queue so the test
        # stays synchronous.
        with patch.object(arq.ArqRedis, "enqueue_job", return_value=None):
            publish_score_set(client, ss["urn"])

    resp = client.post("/api/v1/score-sets/search", json={"limit": 2})
    assert resp.status_code == 200
    body = resp.json()
    # The page honors the limit, but the total count covers every match.
    assert len(body["scoreSets"]) == 2
    assert body["numScoreSets"] == num_score_sets
2424+
def test_search_score_sets_not_affected_by_experiment_metadata(
    session, data_provider, client, setup_router_db, data_files
):
    """Experiments with multiple keywords should not reduce the number of score sets returned by search.

    This is a regression test for a bug where joinedload on one-to-many experiment relationships caused row
    multiplication in the main SQL query. The LIMIT clause was applied to the multiplied rows rather than unique
    score sets, resulting in fewer results than expected.
    """
    from tests.helpers.constants import TEST_EXPERIMENT_WITH_KEYWORD

    num_score_sets = 3
    # Each experiment carries keyword metadata, so a joinedload of the keyword
    # relationship would fan out the main query's rows.
    for i in range(num_score_sets):
        exp = create_experiment(client, {**TEST_EXPERIMENT_WITH_KEYWORD, "title": f"Experiment {i}"})
        ss = create_seq_score_set(client, exp["urn"], update={"title": f"Score Set {i}"})
        ss = mock_worker_variant_insertion(client, session, data_provider, ss, data_files / "scores.csv")

        # Stub the job queue so publishing doesn't require a running worker.
        with patch.object(arq.ArqRedis, "enqueue_job", return_value=None):
            publish_score_set(client, ss["urn"])

    resp = client.post("/api/v1/score-sets/search", json={"limit": 2})
    assert resp.status_code == 200
    body = resp.json()
    # Despite the rich experiment metadata, the limit still yields the full
    # page of unique score sets and an accurate total.
    assert len(body["scoreSets"]) == 2
    assert body["numScoreSets"] == num_score_sets
24062453
########################################################################################################################
24072454
# Score set deletion
24082455
########################################################################################################################

0 commit comments

Comments
 (0)