From d703209c429d7e6a9228e4a7b240c8762fb84bac Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 6 May 2026 02:19:49 +0000 Subject: [PATCH] Add backfill support to SMResourceOptions and fix topology-sensitive tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `backfill` field to SMResourceOptions. When True, sets CU_DEV_SM_RESOURCE_GROUP_BACKFILL on each group's flags, allowing the driver to relax the co-scheduling constraint when assigning SMs to groups. This enables requesting arbitrary aligned SM counts that the driver would otherwise reject due to hardware topology constraints. Fix test_two_groups and related tests: replace _aligned_half (which computed half the SMs rounded to alignment — not always a valid partition on all GPU topologies) with _safe_two_group_count (uses min_partition_size, always valid). Add dedicated test_two_groups_backfill that exercises the aggressive even-split with backfill=True. Fixes #2025. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_device_resources.pyx | 19 +++++++- cuda_core/tests/test_green_context.py | 56 +++++++++++++---------- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index e4851a73a41..40c0a874d05 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -106,11 +106,18 @@ cdef class SMResourceOptions: Preferred co-scheduled SM count; the driver tries to satisfy this but may fall back to ``coscheduled_sm_count``. (Default to ``None``) + backfill : bool or Sequence[bool], optional + If ``True``, allow the driver to relax the co-scheduling + constraint when assigning SMs. This enables requesting + arbitrary aligned SM counts that the driver would otherwise + reject due to hardware topology constraints. + (Default to ``False``) """ count: int | SequenceABC | None = None coscheduled_sm_count: int | SequenceABC | None = None preferred_coscheduled_sm_count: int | SequenceABC | None = None + backfill: bool | SequenceABC = False @dataclass @@ -172,6 +179,12 @@ cdef inline int _resolve_group_count(SMResourceOptions options) except?-1: n_groups, count_is_scalar, ) + _validate_split_field_length( + options.backfill, + "backfill", + n_groups, + count_is_scalar, + ) return n_groups @@ -243,6 +256,7 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: cdef list counts = _broadcast_field(options.count, n_groups) cdef list coscheduled = _broadcast_field(options.coscheduled_sm_count, n_groups) cdef list preferred = _broadcast_field(options.preferred_coscheduled_sm_count, n_groups) + cdef list backfills = _broadcast_field(options.backfill, n_groups) cdef int i for i in range(n_groups): @@ -252,7 +266,10 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: params[i].coscheduledSmCount = (coscheduled[i]) if preferred[i] is not None: params[i].preferredCoscheduledSmCount = (preferred[i]) - params[i].flags = 0 + params[i].flags = ( + cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_BACKFILL + if backfills[i] else 0 + ) return 0 diff --git a/cuda_core/tests/test_green_context.py b/cuda_core/tests/test_green_context.py index 8eb32f7c1aa..4d89c8d76f7 100644 --- a/cuda_core/tests/test_green_context.py +++ b/cuda_core/tests/test_green_context.py @@ -82,11 +82,16 @@ def fill_kernel(init_cuda): return mod.get_kernel("fill") -def _aligned_half(sm): - """Compute half the SM count, rounded down to min_partition_size alignment.""" +def _safe_two_group_count(sm): + """Return a safe per-group SM count for a 2-group split. + + Uses min_partition_size which is always a valid split size regardless + of hardware topology. Returns None if the device doesn't have enough SMs. + """ min_size = sm.min_partition_size - half = (sm.sm_count // 2 // min_size) * min_size - return half + if sm.sm_count < 2 * min_size: + return None + return min_size @contextlib.contextmanager @@ -238,30 +243,33 @@ def test_discovery_respects_alignment(self, sm_resource): assert groups[0].sm_count % sm_resource.coscheduled_alignment == 0 def test_two_groups(self, sm_resource): - """Two-group split with explicit aligned counts.""" - half = _aligned_half(sm_resource) - if half < sm_resource.min_partition_size: + """Two-group split with min_partition_size (always topology-safe).""" + count = _safe_two_group_count(sm_resource) + if count is None: pytest.skip("Not enough SMs for a 2-group split") - groups, rem = sm_resource.split(SMResourceOptions(count=(half, half))) + groups, rem = sm_resource.split(SMResourceOptions(count=(count, count))) assert len(groups) == 2 - assert groups[0].sm_count > 0 - assert groups[1].sm_count > 0 + assert groups[0].sm_count >= count + assert groups[1].sm_count >= count total = groups[0].sm_count + groups[1].sm_count + rem.sm_count assert total <= sm_resource.sm_count - def test_two_groups_each_meets_request(self, sm_resource): - min_size = sm_resource.min_partition_size - half = _aligned_half(sm_resource) - if half < min_size: - pytest.skip("Not enough SMs for a 2-group split") + def test_two_groups_backfill(self, sm_resource): + """Two-group split with backfill allows larger partitions.""" + align = sm_resource.coscheduled_alignment + if align == 0: + align = sm_resource.min_partition_size + half = (sm_resource.sm_count // 2 // align) * align + if half < sm_resource.min_partition_size: + pytest.skip("Not enough SMs for a 2-group backfill split") - groups, _ = sm_resource.split(SMResourceOptions(count=(min_size, min_size))) + groups, rem = sm_resource.split(SMResourceOptions(count=(half, half), backfill=True)) assert len(groups) == 2 - assert groups[0].sm_count >= min_size - assert groups[1].sm_count >= min_size + assert groups[0].sm_count >= half + assert groups[1].sm_count >= half def test_dry_run_matches_real(self, sm_resource): """Dry-run reports the same SM counts as a real split.""" @@ -352,11 +360,11 @@ def test_green_ctx_sm_resources(self, green_ctx, sm_resource): def test_green_ctx_resources_reflect_partition(self, init_cuda, sm_resource): """Two green contexts should have disjoint SM partitions.""" - half = _aligned_half(sm_resource) - if half < sm_resource.min_partition_size: + count = _safe_two_group_count(sm_resource) + if count is None: pytest.skip("Not enough SMs for a 2-group split") - groups, _ = sm_resource.split(SMResourceOptions(count=(half, half))) + groups, _ = sm_resource.split(SMResourceOptions(count=(count, count))) ctx_a = ctx_b = None try: @@ -425,11 +433,11 @@ def test_launch_and_verify(self, init_cuda, green_ctx, fill_kernel): def test_two_green_contexts_independent(self, init_cuda, sm_resource, fill_kernel): """Two SM groups -> two green contexts -> two independent kernels.""" dev = init_cuda - half = _aligned_half(sm_resource) - if half < sm_resource.min_partition_size: + count = _safe_two_group_count(sm_resource) + if count is None: pytest.skip("Not enough SMs for a 2-group split") - groups, _ = sm_resource.split(SMResourceOptions(count=(half, half))) + groups, _ = sm_resource.split(SMResourceOptions(count=(count, count))) assert len(groups) == 2 ctx_a = ctx_b = None