Skip to content

Commit 7e5d5d2

Browse files
committed
fix: use selectinload for experiment relationships in score set search
Switch one-to-many experiment relationship loading (keyword_objs, doi_identifiers, publication_identifier_associations, raw_read_identifiers) from joinedload to selectinload inside the contains_eager block. This prevents row multiplication from causing the SQL LIMIT to apply to multiplied rows rather than unique score sets, which resulted in search returning fewer results than expected on databases with rich experiment metadata.
1 parent 95fab85 commit 7e5d5d2

2 files changed

Lines changed: 59 additions & 4 deletions

File tree

src/mavedb/lib/score_sets.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,18 +238,26 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
238238
score_sets: list[ScoreSet] = (
239239
query.join(ScoreSet.experiment)
240240
.options(
241+
# Use selectinload for one-to-many experiment relationships to avoid row
242+
# multiplication in the main query. joinedload would LEFT OUTER JOIN these
243+
# into the main SQL query, and because they're nested inside contains_eager,
244+
# SQLAlchemy's subquery-wrapping logic doesn't protect the LIMIT clause from
245+
# being applied to multiplied rows rather than unique score sets. This would
246+
# cause the count of returned score sets to be less than the requested limit,
247+
# and the count query would be triggered even when the number of unique score
248+
# sets in the main query results exceeds the limit.
241249
contains_eager(ScoreSet.experiment).options(
242250
joinedload(Experiment.experiment_set),
243-
joinedload(Experiment.keyword_objs).joinedload(
251+
selectinload(Experiment.keyword_objs).joinedload(
244252
ExperimentControlledKeywordAssociation.controlled_keyword
245253
),
246254
joinedload(Experiment.created_by),
247255
joinedload(Experiment.modified_by),
248-
joinedload(Experiment.doi_identifiers),
249-
joinedload(Experiment.publication_identifier_associations).joinedload(
256+
selectinload(Experiment.doi_identifiers),
257+
selectinload(Experiment.publication_identifier_associations).joinedload(
250258
ExperimentPublicationIdentifierAssociation.publication
251259
),
252-
joinedload(Experiment.raw_read_identifiers),
260+
selectinload(Experiment.raw_read_identifiers),
253261
selectinload(Experiment.score_sets).options(
254262
joinedload(ScoreSet.doi_identifiers),
255263
joinedload(ScoreSet.publication_identifier_associations).joinedload(

tests/routers/test_score_set.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2403,6 +2403,53 @@ def test_search_filter_options_hidden_by_published_superseding_version(
24032403
assert target_name not in target_names
24042404

24052405

def test_search_score_sets_reports_correct_total_count_with_limit(
    session, data_provider, client, setup_router_db, data_files
):
    """When more published score sets exist than the search limit, num_score_sets should reflect the true total."""
    num_score_sets = 3
    # Publish more score sets than the search limit so the paginated response
    # must report a total count larger than the page size.
    for i in range(num_score_sets):
        exp = create_experiment(client, {"title": f"Experiment {i}"})
        ss = create_seq_score_set(client, exp["urn"], update={"title": f"Score Set {i}"})
        ss = mock_worker_variant_insertion(client, session, data_provider, ss, data_files / "scores.csv")

        # Publishing enqueues a background job; stub out the queue so the test
        # stays synchronous.
        with patch.object(arq.ArqRedis, "enqueue_job", return_value=None):
            publish_score_set(client, ss["urn"])

    resp = client.post("/api/v1/score-sets/search", json={"limit": 2})
    assert resp.status_code == 200
    body = resp.json()
    # The page honors the limit, but the total count covers every match.
    assert len(body["scoreSets"]) == 2
    assert body["numScoreSets"] == num_score_sets
2424+
def test_search_score_sets_not_affected_by_experiment_metadata(
    session, data_provider, client, setup_router_db, data_files
):
    """Experiments with multiple keywords should not reduce the number of score sets returned by search.

    This is a regression test for a bug where joinedload on one-to-many experiment relationships caused row
    multiplication in the main SQL query. The LIMIT clause was applied to the multiplied rows rather than unique
    score sets, resulting in fewer results than expected.
    """
    from tests.helpers.constants import TEST_EXPERIMENT_WITH_KEYWORD

    num_score_sets = 3
    # Each experiment carries keyword metadata, so a joinedload of the keyword
    # relationship would fan out the main query's rows.
    for i in range(num_score_sets):
        exp = create_experiment(client, {**TEST_EXPERIMENT_WITH_KEYWORD, "title": f"Experiment {i}"})
        ss = create_seq_score_set(client, exp["urn"], update={"title": f"Score Set {i}"})
        ss = mock_worker_variant_insertion(client, session, data_provider, ss, data_files / "scores.csv")

        # Stub the job queue so publishing doesn't require a running worker.
        with patch.object(arq.ArqRedis, "enqueue_job", return_value=None):
            publish_score_set(client, ss["urn"])

    resp = client.post("/api/v1/score-sets/search", json={"limit": 2})
    assert resp.status_code == 200
    body = resp.json()
    # Despite the rich experiment metadata, the limit still yields the full
    # page of unique score sets and an accurate total.
    assert len(body["scoreSets"]) == 2
    assert body["numScoreSets"] == num_score_sets
24062453
########################################################################################################################
24072454
# Score set deletion
24082455
########################################################################################################################

0 commit comments

Comments
 (0)