From 31bf0bc7a2d97a561f9a675efc01ee610fb85e6d Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Wed, 1 Apr 2026 14:19:17 -0400 Subject: [PATCH 01/22] feat: initial changes to allow for more than one resource --- docs/resource-selection.md | 242 ++++++++++++++++ src/example_config.json | 21 +- src/tests/services/__init__.py | 0 src/tests/services/test_resource_selector.py | 219 +++++++++++++++ .../schemas/resources/LocalResource.json | 84 +++++- .../schemas/resources/SlurmAPIResource.json | 103 ++++++- .../services/__init__.py | 0 .../services/resource_selector.py | 258 ++++++++++++++++++ .../views/workspace_view.py | 37 ++- 9 files changed, 946 insertions(+), 18 deletions(-) create mode 100644 docs/resource-selection.md create mode 100644 src/tests/services/__init__.py create mode 100644 src/tests/services/test_resource_selector.py create mode 100644 src/user_workspaces_server/services/__init__.py create mode 100644 src/user_workspaces_server/services/resource_selector.py diff --git a/docs/resource-selection.md b/docs/resource-selection.md new file mode 100644 index 00000000..737c9c26 --- /dev/null +++ b/docs/resource-selection.md @@ -0,0 +1,242 @@ +# Intelligent Resource Selection + +When a job is submitted, the server automatically selects the most appropriate compute resource rather than routing everything to a single hardcoded `main_resource`. Selection runs in two phases: filter and score. + +--- + +## How It Works + +### Phase 1 — Filter (hard constraints) + +Resources that cannot fulfil the job are eliminated: + +| Check | Resource config field | +|---|---| +| GPU required but resource has none | `capabilities.gpu_enabled` | +| Requested CPUs exceed limit | `capabilities.max_cpus` | +| Requested memory (MB) exceeds limit | `capabilities.max_memory_mb` | +| Requested time (minutes) exceeds limit | `capabilities.max_time_minutes` | +| User lacks permission on this resource | `resource_user_authentication.has_permission()` | +| Health check fails | `connection_details.health_check_url` | + +Resources that do not have a `capabilities` section in their config are skipped entirely and never considered by the selector (they remain available as the `main_resource` fallback). + +### Phase 2 — Score (soft preferences) + +Each eligible resource receives a score and the highest scorer is selected: + +``` +score = (priority × 10) + + (50 if job_type in preferred_for_job_types) + + (30 if GPU job on GPU resource) + + (20 if CPU job on CPU-only resource) + - (cost_per_core_hour × 5) + - (utilization_fraction × 20) +``` + +`utilization_fraction` = active jobs / `capabilities.max_concurrent_jobs`, capped at 1.0. Omit `max_concurrent_jobs` from a resource's capabilities to disable utilization scoring for that resource. + +### Fallback + +If no resource passes the filter (e.g. a GPU job on a CPU-only deployment, or impossible resource limits), the server falls back to `main_resource` and logs a warning. No error is raised. + +--- + +## Configuration + +Add three optional sections to any resource in `available_resources`. Resources without these sections are ignored by the selector. + +### `capabilities` + +Hard limits used for filtering. Units match the resource option parameter names directly — no conversion needed. **Required fields** when the section is present: `gpu_enabled`, `max_cpus`, `max_memory_mb`, `max_time_minutes`. + +```json +"capabilities": { + "gpu_enabled": true, + "gpu_types": ["A100", "V100"], + "max_cpus": 128, + "max_memory_mb": 524288, + "max_time_minutes": 2880, + "max_gpus": 8, + "max_concurrent_jobs": 50, + "partitions": ["GPU", "GPU-shared"] +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `gpu_enabled` | boolean | yes | Whether the resource has GPU support | +| `max_cpus` | integer | yes | Maximum CPUs per job (matches `num_cpus`) | +| `max_memory_mb` | integer | yes | Maximum memory per job in MB (matches `memory_mb`) | +| `max_time_minutes` | integer | yes | Maximum walltime per job in minutes (matches `time_limit_min`) | +| `gpu_types` | array[string] | no | Available GPU models | +| `max_gpus` | integer | no | Maximum GPUs per job (default: 0) | +| `max_concurrent_jobs` | integer | no | Soft job ceiling for utilization scoring. Omit to skip utilization scoring. | +| `partitions` | array[string] | no | Available Slurm partitions | + +### `selection_criteria` + +Scoring hints. **Required field**: `priority`. + +```json +"selection_criteria": { + "priority": 10, + "cost_per_core_hour": 1.5, + "preferred_for_job_types": ["jupyter_lab"] +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `priority` | integer (0–100) | yes | Base score multiplier. Higher = preferred when all else is equal. | +| `cost_per_core_hour` | float | no | Relative cost metric. Lower = preferred. (default: 1.0) | +| `preferred_for_job_types` | array[string] | no | Job type keys that earn a +50 bonus on this resource. | + +> **Note:** User/group access control is enforced through the resource's `user_authentication` controller (e.g. `GlobusUserAuthentication`), not through `selection_criteria`. Configure `allowed_globus_groups` on the auth controller itself. + +--- + +## Multi-Resource Example + +A deployment with a GPU cluster and a CPU cluster: + +```json +"available_resources": { + "hive_gpu_cluster": { + "name": "Hive GPU Cluster", + "resource_type": "SlurmAPIResource", + "storage": "hubmap_local_fs", + "user_authentication": "globus_auth", + "connection_details": { + "root_url": "https://slurm-gpu.hive.psc.edu/proxy", + "api_token": "...", + "health_check_url": "https://slurm-gpu.hive.psc.edu/health" + }, + "cpu_partition": "GPU-shared", + "gpu_partition": "GPU", + "capabilities": { + "gpu_enabled": true, + "gpu_types": ["A100"], + "max_cpus": 128, + "max_memory_mb": 524288, + "max_time_minutes": 2880, + "max_gpus": 8, + "max_concurrent_jobs": 50 + }, + "selection_criteria": { + "priority": 10, + "cost_per_core_hour": 1.5, + "preferred_for_job_types": ["jupyter_lab"] + } + }, + + "hive_cpu_cluster": { + "name": "Hive CPU Cluster", + "resource_type": "SlurmAPIResource", + "storage": "hubmap_local_fs", + "user_authentication": "globus_auth", + "connection_details": { + "root_url": "https://slurm-cpu.hive.psc.edu/proxy", + "api_token": "...", + "health_check_url": "https://slurm-cpu.hive.psc.edu/health" + }, + "cpu_partition": "RM", + "capabilities": { + "gpu_enabled": false, + "max_cpus": 64, + "max_memory_mb": 262144, + "max_time_minutes": 10080, + "max_gpus": 0, + "max_concurrent_jobs": 100 + }, + "selection_criteria": { + "priority": 5, + "cost_per_core_hour": 0.8, + "preferred_for_job_types": ["jupyter_lab"] + } + } +} +``` + +With this config: +- A GPU job is routed to `hive_gpu_cluster` (CPU cluster filtered out). +- A CPU job can go to either cluster — GPU cluster scores higher due to priority, but CPU cluster scores an efficiency bonus (+20). Whether they load-balance depends on how close the scores are at runtime. +- If both clusters are unhealthy or over capacity, the job falls back to `main_resource`. + +### `environment_details` for multi-resource job types + +When multiple resources are configured, each job type's `environment_details` can provide per-resource environment config. The server uses the selected resource's key first, then falls back to the `main_resource` key: + +```json +"available_job_types": { + "jupyter_lab": { + "name": "Jupyter Lab", + "job_type": "JupyterLabJob", + "environment_details": { + "hive_gpu_cluster": { + "python_version": "python3.10", + "module_manager": "virtualenv", + "modules": ["jupyterlab"], + "time_limit": "60", + "environment_name": "JupyterLabJob" + }, + "hive_cpu_cluster": { + "python_version": "python3.10", + "module_manager": "virtualenv", + "modules": ["jupyterlab"], + "time_limit": "60", + "environment_name": "JupyterLabJob" + }, + "main_resource": { + "python_version": "python3.10", + "module_manager": "virtualenv", + "modules": ["jupyterlab"], + "time_limit": "60", + "environment_name": "JupyterLabJob" + } + } + } +} +``` + +--- + +## Access Control + +Resource-level access control is handled by the resource's `user_authentication` controller, not by the selector config. When a user submits a job, the selector calls `has_permission(user)` on each candidate resource's auth controller. Resources for which the user lacks permission are filtered out. + +For `GlobusUserAuthentication`, this means: +- The user must have a stored external mapping (i.e. must have authenticated at least once). +- If `allowed_globus_groups` is set on the auth controller, the user's stored groups token is checked against those groups. + +To restrict a resource to a specific group, configure `allowed_globus_groups` on the auth controller assigned to that resource: + +```json +"available_user_authentication": { + "gpu_cluster_auth": { + "user_authentication_type": "GlobusUserAuthentication", + "connection_details": { + "client_id": "...", + "client_secret": "...", + "authentication_type": "token", + "allowed_globus_groups": [""] + } + } +}, +"available_resources": { + "hive_gpu_cluster": { + "user_authentication": "gpu_cluster_auth", + ... + } +} +``` + +> **Known limitation:** `has_permission` is currently a placeholder that always returns `True`. Real group membership enforcement is pending Phase 4 implementation in `GlobusUserAuthentication`. + +--- + +## Backward Compatibility + +- Existing deployments with no `capabilities` section continue to work unchanged — those resources are skipped by the selector and the `main_resource` fallback is used. +- Adding `capabilities` to a resource opts it into the selector without any other changes required. +- The `main_resource` key in `config.json` still determines the fallback resource. diff --git a/src/example_config.json b/src/example_config.json index d086edfe..93049dc7 100644 --- a/src/example_config.json +++ b/src/example_config.json @@ -31,10 +31,23 @@ "passthrough_domain": "127.0.0.1:8000", "connection_details": {}, "parameter_mapping": { - "num_cpus": "cpus_per_task", - "memory_mb": "memory_per_node", - "time_limit_min": "time_limit" - } + "num_cpus": "cpus_per_task", + "memory_mb": "memory_per_node", + "time_limit_min": "time_limit" + }, + "capabilities": { + "gpu_enabled": false, + "max_cpus": 4, + "max_memory_mb": 8192, + "max_time_minutes": 480, + "max_gpus": 0, + "max_concurrent_jobs": 10 + }, + "selection_criteria": { + "priority": 1, + "cost_per_core_hour": 0.0, + "preferred_for_job_types": ["local_test_job", "jupyter_lab"] + } } }, "available_job_types": { diff --git a/src/tests/services/__init__.py b/src/tests/services/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/tests/services/test_resource_selector.py b/src/tests/services/test_resource_selector.py new file mode 100644 index 00000000..c48df5ac --- /dev/null +++ b/src/tests/services/test_resource_selector.py @@ -0,0 +1,219 @@ +from unittest.mock import MagicMock, patch + +from django.test import TestCase + +from user_workspaces_server.services.resource_selector import ( + LoadTracker, + ResourceSelector, +) + + +def make_resource( + gpu_enabled=False, + max_cpus=64, + max_memory_mb=262144, # 256 GB in MB + max_time_minutes=10080, # 7 days in minutes + max_gpus=0, + max_concurrent_jobs=100, + priority=5, + cost=1.0, + preferred_for=None, + has_health_check_url=False, +): + """Return a mock AbstractResource with the given capabilities.""" + resource = MagicMock() + resource.config = { + "capabilities": { + "gpu_enabled": gpu_enabled, + "max_cpus": max_cpus, + "max_memory_mb": max_memory_mb, + "max_time_minutes": max_time_minutes, + "max_gpus": max_gpus, + "max_concurrent_jobs": max_concurrent_jobs, + }, + "selection_criteria": { + "priority": priority, + "cost_per_core_hour": cost, + "preferred_for_job_types": preferred_for or [], + }, + "connection_details": { + "health_check_url": "http://example.com/health" if has_health_check_url else "" + }, + } + return resource + + +def make_two_resource_dict(): + gpu = make_resource( + gpu_enabled=True, max_cpus=128, max_memory_mb=524288, max_time_minutes=2880, + priority=10, cost=1.5, preferred_for=["jupyter_lab"], + ) + cpu = make_resource( + gpu_enabled=False, max_cpus=64, max_memory_mb=262144, max_time_minutes=10080, + priority=5, cost=0.8, preferred_for=["jupyter_lab"], + ) + return {"gpu_cluster": gpu, "cpu_cluster": cpu} + + +BASE_OPTIONS = {"gpu_enabled": False, "num_cpus": 1, "memory_mb": 0, "time_limit_min": 0} + + +class TestFilterEligibleResources(TestCase): + + def setUp(self): + self.available = make_two_resource_dict() + self.selector = ResourceSelector(self.available) + self.user = MagicMock() + + def test_gpu_job_filters_cpu_only_resource(self): + opts = {**BASE_OPTIONS, "gpu_enabled": True, "num_cpus": 4} + eligible = self.selector._filter_eligible_resources("jupyter_lab", opts, self.user) + names = [name for name, _ in eligible] + self.assertIn("gpu_cluster", names) + self.assertNotIn("cpu_cluster", names) + + def test_cpu_job_passes_both_resources(self): + eligible = self.selector._filter_eligible_resources("jupyter_lab", BASE_OPTIONS, self.user) + names = [name for name, _ in eligible] + self.assertIn("gpu_cluster", names) + self.assertIn("cpu_cluster", names) + + def test_filter_cpu_capacity(self): + # cpu_cluster max_cpus=64; request 128 should exclude it + opts = {**BASE_OPTIONS, "num_cpus": 128} + eligible = self.selector._filter_eligible_resources("batch_job", opts, self.user) + names = [name for name, _ in eligible] + self.assertNotIn("cpu_cluster", names) + self.assertIn("gpu_cluster", names) + + def test_filter_memory_capacity(self): + # cpu_cluster max_memory_mb=262144 (256 GB); request 300 GB = 307200 MB + opts = {**BASE_OPTIONS, "memory_mb": 307200} + eligible = self.selector._filter_eligible_resources("batch_job", opts, self.user) + names = [name for name, _ in eligible] + self.assertNotIn("cpu_cluster", names) + + def test_filter_time_limit(self): + # cpu_cluster max_time_minutes=10080 (7 days); request 12000 minutes + opts = {**BASE_OPTIONS, "time_limit_min": 12000} + eligible = self.selector._filter_eligible_resources("batch_job", opts, self.user) + names = [name for name, _ in eligible] + self.assertNotIn("cpu_cluster", names) + + def test_resources_without_capabilities_config_are_skipped(self): + bare_resource = MagicMock() + bare_resource.config = {} + self.selector.available_resources = {"bare": bare_resource} + eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) + self.assertEqual(eligible, []) + + def test_unhealthy_resource_filtered(self): + self.available["cpu_cluster"].config["connection_details"]["health_check_url"] = "http://example.com/health" + self.available["cpu_cluster"].health_check.return_value = {"connected": False} + eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) + names = [name for name, _ in eligible] + self.assertNotIn("cpu_cluster", names) + + def test_resource_filtered_when_auth_denies(self): + self.available["cpu_cluster"].resource_user_authentication.has_permission.return_value = False + eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) + names = [name for name, _ in eligible] + self.assertNotIn("cpu_cluster", names) + + def test_resource_included_when_auth_allows(self): + self.available["cpu_cluster"].resource_user_authentication.has_permission.return_value = MagicMock() + eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) + names = [name for name, _ in eligible] + self.assertIn("cpu_cluster", names) + + def test_resource_filtered_when_auth_raises(self): + self.available["cpu_cluster"].resource_user_authentication.has_permission.side_effect = Exception("auth error") + eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) + names = [name for name, _ in eligible] + self.assertNotIn("cpu_cluster", names) + + +class TestScoreResources(TestCase): + + def setUp(self): + self.available = make_two_resource_dict() + self.selector = ResourceSelector(self.available) + self.selector.load_tracker.get_job_count = MagicMock(return_value=0) + + def test_gpu_job_scores_gpu_resource_highest(self): + eligible = list(self.available.items()) + opts = {**BASE_OPTIONS, "gpu_enabled": True, "num_cpus": 8} + scored = self.selector._score_resources(eligible, "jupyter_lab", opts) + self.assertEqual(scored[0]["resource_name"], "gpu_cluster") + + def test_cpu_job_on_cpu_resource_gets_efficiency_bonus(self): + eligible = list(self.available.items()) + scored = self.selector._score_resources(eligible, "jupyter_lab", BASE_OPTIONS) + cpu_score = next(r["score"] for r in scored if r["resource_name"] == "cpu_cluster") + # priority(5)*10 + job_type_match(50) + efficiency(20) - cost(0.8*5) = 116 + self.assertAlmostEqual(cpu_score, 116.0) + + def test_high_utilization_reduces_score(self): + eligible = [("cpu_cluster", self.available["cpu_cluster"])] + + self.selector.load_tracker.get_job_count = MagicMock(return_value=0) + scored_zero = self.selector._score_resources(eligible, "batch_job", BASE_OPTIONS) + + self.selector.load_tracker.get_job_count = MagicMock(return_value=50) + scored_half = self.selector._score_resources(eligible, "batch_job", BASE_OPTIONS) + + self.assertGreater(scored_zero[0]["score"], scored_half[0]["score"]) + + def test_lower_cost_resource_scores_higher_all_else_equal(self): + expensive = make_resource(priority=5, cost=2.0) + cheap = make_resource(priority=5, cost=0.5) + eligible = [("expensive", expensive), ("cheap", cheap)] + self.selector.load_tracker.get_job_count = MagicMock(return_value=0) + scored = self.selector._score_resources(eligible, "other_job", BASE_OPTIONS) + self.assertEqual(scored[0]["resource_name"], "cheap") + + def test_no_utilization_when_max_concurrent_jobs_absent(self): + r = make_resource(priority=5, cost=1.0) + del r.config["capabilities"]["max_concurrent_jobs"] + eligible = [("r", r)] + self.selector.load_tracker.get_job_count = MagicMock(return_value=99) + scored = self.selector._score_resources(eligible, "job", BASE_OPTIONS) + # utilization should be 0 so load has no effect on score + self.assertEqual(scored[0]["utilization"], 0.0) + + +class TestSelectResource(TestCase): + + def setUp(self): + self.available = make_two_resource_dict() + self.selector = ResourceSelector(self.available) + self.selector.load_tracker.get_job_count = MagicMock(return_value=0) + self.user = MagicMock() + + def test_gpu_job_selects_gpu_cluster(self): + opts = {**BASE_OPTIONS, "gpu_enabled": True, "num_cpus": 8} + key, resource = self.selector.select_resource("jupyter_lab", opts, self.user) + self.assertEqual(key, "gpu_cluster") + + def test_fallback_when_no_eligible_resources(self): + opts = {**BASE_OPTIONS, "num_cpus": 99999} + with self.settings(UWS_CONFIG={"main_resource": "gpu_cluster"}): + key, resource = self.selector.select_resource("jupyter_lab", opts, self.user) + self.assertEqual(key, "gpu_cluster") + self.assertIs(resource, self.available["gpu_cluster"]) + + def test_highest_scoring_resource_is_selected(self): + # gpu_cluster has priority=10, cpu_cluster has priority=5 + # For a CPU job both are eligible; gpu_cluster still wins on priority + key, _ = self.selector.select_resource("jupyter_lab", BASE_OPTIONS, self.user) + self.assertEqual(key, "gpu_cluster") + + +class TestLoadTracker(TestCase): + + def test_get_job_count_queries_db(self): + tracker = LoadTracker() + with patch("user_workspaces_server.services.resource_selector.LoadTracker.get_job_count") as mock_count: + mock_count.return_value = 5 + result = tracker.get_job_count("SlurmAPIResource") + self.assertEqual(result, 5) diff --git a/src/user_workspaces_server/config_schemas/schemas/resources/LocalResource.json b/src/user_workspaces_server/config_schemas/schemas/resources/LocalResource.json index a9f747b0..497823fb 100644 --- a/src/user_workspaces_server/config_schemas/schemas/resources/LocalResource.json +++ b/src/user_workspaces_server/config_schemas/schemas/resources/LocalResource.json @@ -38,6 +38,75 @@ "type": "object", "default": {}, "description": "Connection configuration (typically empty for LocalResource)" + }, + "capabilities": { + "type": "object", + "description": "Hardware capabilities used by the intelligent resource selector to filter ineligible resources. Omit this section to exclude the resource from intelligent selection (it will still be used as main_resource fallback).", + "properties": { + "gpu_enabled": { + "type": "boolean", + "default": false, + "description": "Whether this resource has GPU support" + }, + "gpu_types": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Available GPU models" + }, + "max_cpus": { + "type": "integer", + "minimum": 1, + "description": "Maximum CPUs that can be requested per job" + }, + "max_memory_mb": { + "type": "integer", + "minimum": 1, + "description": "Maximum memory per job in MB (matches the memory_mb resource option)" + }, + "max_time_minutes": { + "type": "integer", + "minimum": 1, + "description": "Maximum walltime per job in minutes (matches the time_limit_min resource option)" + }, + "max_gpus": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Maximum GPUs per job (0 = no GPU support)" + }, + "max_concurrent_jobs": { + "type": "integer", + "minimum": 1, + "description": "Soft concurrent-job ceiling used to calculate utilization fraction for scoring. Omit to disable utilization scoring for this resource." + } + }, + "required": ["gpu_enabled", "max_cpus", "max_memory_mb", "max_time_minutes"] + }, + "selection_criteria": { + "type": "object", + "description": "Scoring hints used by the intelligent resource selector to rank eligible resources.", + "properties": { + "priority": { + "type": "integer", + "minimum": 0, + "default": 5, + "description": "Base priority score (0-100). Higher values make this resource preferred when all else is equal." + }, + "cost_per_core_hour": { + "type": "number", + "minimum": 0, + "default": 1.0, + "description": "Relative cost metric per core-hour." + }, + "preferred_for_job_types": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Job type keys that this resource is preferred for." + } + }, + "required": ["priority"] } }, "examples": [ @@ -47,7 +116,20 @@ "storage": "main_storage", "user_authentication": "main_auth", "passthrough_domain": "127.0.0.1:8000", - "connection_details": {} + "connection_details": {}, + "capabilities": { + "gpu_enabled": false, + "max_cpus": 4, + "max_memory_mb": 16384, + "max_time_minutes": 480, + "max_gpus": 0, + "max_concurrent_jobs": 10 + }, + "selection_criteria": { + "priority": 1, + "cost_per_core_hour": 0.0, + "preferred_for_job_types": ["local_test_job"] + } } ] } \ No newline at end of file diff --git a/src/user_workspaces_server/config_schemas/schemas/resources/SlurmAPIResource.json b/src/user_workspaces_server/config_schemas/schemas/resources/SlurmAPIResource.json index 2d9baf8a..d21c818d 100644 --- a/src/user_workspaces_server/config_schemas/schemas/resources/SlurmAPIResource.json +++ b/src/user_workspaces_server/config_schemas/schemas/resources/SlurmAPIResource.json @@ -64,24 +64,115 @@ }, "health_check_url": { "type": "string", - "description": "URL for health check endpoint" + "description": "URL for health check endpoint. When present, the resource is health-checked before each selection." } } + }, + "capabilities": { + "type": "object", + "description": "Hardware capabilities used by the intelligent resource selector to filter ineligible resources. Omit this section to exclude the resource from intelligent selection (it will still be used as main_resource fallback).", + "properties": { + "gpu_enabled": { + "type": "boolean", + "default": false, + "description": "Whether this resource has GPU support" + }, + "gpu_types": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Available GPU models (e.g. ['A100', 'V100'])" + }, + "max_cpus": { + "type": "integer", + "minimum": 1, + "description": "Maximum CPUs that can be requested per job" + }, + "max_memory_mb": { + "type": "integer", + "minimum": 1, + "description": "Maximum memory per job in MB (matches the memory_mb resource option)" + }, + "max_time_minutes": { + "type": "integer", + "minimum": 1, + "description": "Maximum walltime per job in minutes (matches the time_limit_min resource option)" + }, + "max_gpus": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Maximum GPUs per job (0 = no GPU support)" + }, + "max_concurrent_jobs": { + "type": "integer", + "minimum": 1, + "description": "Soft concurrent-job ceiling used to calculate utilization fraction for scoring. Omit to disable utilization scoring for this resource." + }, + "partitions": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Available SLURM partition names" + } + }, + "required": ["gpu_enabled", "max_cpus", "max_memory_mb", "max_time_minutes"] + }, + "selection_criteria": { + "type": "object", + "description": "Scoring hints used by the intelligent resource selector to rank eligible resources.", + "properties": { + "priority": { + "type": "integer", + "minimum": 0, + "default": 5, + "description": "Base priority score (0-100). Higher values make this resource preferred when all else is equal." + }, + "cost_per_core_hour": { + "type": "number", + "minimum": 0, + "default": 1.0, + "description": "Relative cost metric per core-hour. Lower values are preferred for cost-optimised routing." + }, + "preferred_for_job_types": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Job type keys (matching available_job_types keys) that this resource is preferred for. Matching jobs receive a bonus score." + } + }, + "required": ["priority"] } }, "examples": [ { - "name": "SLURM Cluster", + "name": "SLURM GPU Cluster", "resource_type": "SlurmAPIResource", "storage": "main_storage", "user_authentication": "main_auth", "connection_details": { - "root_url": "https://slurm.example.com/api", + "root_url": "https://slurm-gpu.example.com/api", "api_token": "your-api-token-here", - "token_lifespan": "3600" + "token_lifespan": "3600", + "health_check_url": "https://slurm-gpu.example.com/health" }, - "cpu_partition": "cpu", - "gpu_partition": "gpu" + "cpu_partition": "GPU-shared", + "gpu_partition": "GPU", + "capabilities": { + "gpu_enabled": true, + "gpu_types": ["A100", "V100"], + "max_cpus": 128, + "max_memory_mb": 524288, + "max_time_minutes": 2880, + "max_gpus": 8, + "max_concurrent_jobs": 50, + "partitions": ["GPU", "GPU-shared"] + }, + "selection_criteria": { + "priority": 10, + "cost_per_core_hour": 1.5, + "preferred_for_job_types": ["jupyter_lab"] + } } ] } \ No newline at end of file diff --git a/src/user_workspaces_server/services/__init__.py b/src/user_workspaces_server/services/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/user_workspaces_server/services/resource_selector.py b/src/user_workspaces_server/services/resource_selector.py new file mode 100644 index 00000000..f42b4df1 --- /dev/null +++ b/src/user_workspaces_server/services/resource_selector.py @@ -0,0 +1,258 @@ +import logging +from typing import Dict, List, Tuple + +from django.conf import settings + +logger = logging.getLogger(__name__) + + +class ResourceSelectionError(Exception): + """Raised when no suitable resource can be found.""" + + pass + + +class LoadTracker: + """Track current job counts per resource for utilization scoring.""" + + def get_job_count(self, resource_name: str) -> int: + """Get current active job count for resource (by class name).""" + from user_workspaces_server.models import Job + + return Job.objects.filter( + resource_name=resource_name, + status__in=[Job.Status.PENDING, Job.Status.RUNNING], + ).count() + + +class ResourceSelector: + """ + Intelligent resource selection based on job requirements, + resource capabilities, user permissions, and current load. + """ + + def __init__(self, available_resources: Dict): + self.available_resources = available_resources + self.load_tracker = LoadTracker() + + def select_resource(self, job_type: str, resource_options: Dict, user): + """ + Two-phase resource selection: filter then score. + + Returns: + Tuple of (resource_key, AbstractResource). resource_key is the + key in available_resources (e.g. "hive_gpu_cluster"). + + Falls back to main_resource if no eligible resources are found. + """ + eligible = self._filter_eligible_resources(job_type, resource_options, user) + + if not eligible: + logger.warning( + "No eligible resources found for job_type=%s resource_options=%s; " + "falling back to main_resource", + job_type, + resource_options, + ) + main_resource_key = settings.UWS_CONFIG["main_resource"] + return main_resource_key, self.available_resources[main_resource_key] + + scored = self._score_resources(eligible, job_type, resource_options) + best = scored[0] + + logger.info( + "Resource selection completed", + extra={ + "job_type": job_type, + "resource_options": resource_options, + "eligible_count": len(eligible), + "selected_resource": best["resource_name"], + "score": best["score"], + }, + ) + + return best["resource_name"], best["resource"] + + # ------------------------------------------------------------------ + # Phase 1: Filter + # ------------------------------------------------------------------ + + def _filter_eligible_resources( + self, job_type: str, resource_options: Dict, user + ) -> List[Tuple[str, object]]: + eligible = [] + + for resource_name, resource in self.available_resources.items(): + capabilities = resource.config.get("capabilities") + selection = resource.config.get("selection_criteria") + + # Resources without capabilities/selection_criteria config are + # skipped; fallback to main_resource handles this case. + if capabilities is None or selection is None: + logger.debug( + "Resource %s has no capabilities/selection_criteria config; skipping", + resource_name, + ) + continue + + # Check 1: GPU requirement + if resource_options.get("gpu_enabled", False): + if not capabilities.get("gpu_enabled", False): + logger.debug("Resource %s filtered: no GPU support", resource_name) + continue + + # Check 2: CPU capacity + requested_cpus = resource_options.get("num_cpus", 0) + if requested_cpus > capabilities.get("max_cpus", 0): + logger.debug( + "Resource %s filtered: requested %s CPUs > max %s", + resource_name, + requested_cpus, + capabilities.get("max_cpus"), + ) + continue + + # Check 3: Memory capacity (both in MB) + requested_memory_mb = resource_options.get("memory_mb", 0) + if requested_memory_mb > capabilities.get("max_memory_mb", 0): + logger.debug( + "Resource %s filtered: requested %s MB memory > max %s MB", + resource_name, + requested_memory_mb, + capabilities.get("max_memory_mb"), + ) + continue + + # Check 4: Time limit (both in minutes) + requested_minutes = resource_options.get("time_limit_min", 0) + if requested_minutes > capabilities.get("max_time_minutes", 0): + logger.debug( + "Resource %s filtered: requested %s min > max %s min", + resource_name, + requested_minutes, + capabilities.get("max_time_minutes"), + ) + continue + + # Check 5: User authorization via the resource's own auth method + if not self._user_has_resource_permission(user, resource): + logger.debug( + "Resource %s filtered: user %s lacks permission", + resource_name, + getattr(user, "username", user), + ) + continue + + # Check 6: Health status + if not self._is_resource_healthy(resource): + logger.debug("Resource %s filtered: health check failed", resource_name) + continue + + eligible.append((resource_name, resource)) + + return eligible + + # ------------------------------------------------------------------ + # Phase 2: Score + # ------------------------------------------------------------------ + + def _score_resources( + self, eligible: List[Tuple], job_type: str, resource_options: Dict + ) -> List[Dict]: + scored = [] + + for resource_name, resource in eligible: + score = 0.0 + selection = resource.config["selection_criteria"] + capabilities = resource.config["capabilities"] + + # Factor 1: Base priority (0-100 points) + score += selection.get("priority", 0) * 10 + + # Factor 2: Job type preference (+50 if preferred) + if job_type in selection.get("preferred_for_job_types", []): + score += 50 + + # Factor 3: Resource efficiency (0-30 points) + if resource_options.get("gpu_enabled"): + score += 30 # GPU job on GPU resource + elif not capabilities.get("gpu_enabled"): + score += 20 # CPU job on CPU-only resource (right-sizing) + + # Factor 4: Cost optimization + cost = selection.get("cost_per_core_hour", 1.0) + score -= cost * 5 + + # Factor 5: Current load (-20 to 0 points) + utilization = self._get_utilization(resource_name, resource) + score -= utilization * 20 + + scored.append( + { + "resource_name": resource_name, + "resource": resource, + "score": score, + "utilization": utilization, + } + ) + + scored.sort(key=lambda x: x["score"], reverse=True) + return scored + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _get_utilization(self, resource_name: str, resource) -> float: + """Return utilization as a fraction [0, 1] based on active job count.""" + max_jobs = resource.config.get("capabilities", {}).get("max_concurrent_jobs", 0) + if not max_jobs: + return 0.0 + + current = self.load_tracker.get_job_count(type(resource).__name__) + return min(current / max_jobs, 1.0) + + def _is_resource_healthy(self, resource) -> bool: + """ + Returns True if the resource passes its health check. + + NOTE: This calls health_check() which makes a live HTTP request on + every job submission. Consider caching results if this becomes a + bottleneck (e.g. 30s TTL). + """ + connection_details = resource.config.get("connection_details", {}) + if not connection_details.get("health_check_url"): + # No health check URL configured; assume healthy. + return True + + try: + result = resource.health_check() + return result.get("connected", False) + except Exception as e: + logger.warning("Health check raised exception for %s: %s", resource, e) + return False + + def _user_has_resource_permission(self, user, resource) -> bool: + """ + Check if the user has permission to use this resource by delegating to + the resource's own user authentication method (has_permission). + + For GlobusUserAuthentication this verifies the external user mapping + exists and, when the auth controller is configured with + allowed_globus_groups, confirms the user is still a member. + + Returns True for auth methods whose has_permission always succeeds + (e.g. LocalUserAuthentication) or when the user passes all checks. + Returns False if the auth method denies access or raises an exception. + """ + try: + result = resource.resource_user_authentication.has_permission(user) + return bool(result) + except Exception as e: + logger.warning( + "has_permission raised for resource %s user %s: %s", + type(resource).__name__, + getattr(user, "username", user), + e, + ) + return False diff --git a/src/user_workspaces_server/views/workspace_view.py b/src/user_workspaces_server/views/workspace_view.py index cd0f8d35..717aaaa6 100644 --- a/src/user_workspaces_server/views/workspace_view.py +++ b/src/user_workspaces_server/views/workspace_view.py @@ -14,6 +14,10 @@ from user_workspaces_server import models, utils from user_workspaces_server.exceptions import WorkspaceClientException +from user_workspaces_server.services.resource_selector import ( + ResourceSelector, + ResourceSelectionError, +) from user_workspaces_server.tasks import async_update_workspace logger = logging.getLogger(__name__) @@ -272,11 +276,26 @@ def put(self, request, workspace_id, put_type=None): if not isinstance(resource_options, dict): raise ParseError("Resource options not JSON.") - # TODO: Grabbing the resource needs to be a bit more intelligent - resource = apps.get_app_config("user_workspaces_server").main_resource + selector = ResourceSelector( + apps.get_app_config("user_workspaces_server").available_resources + ) - # TODO: GPU support "gpu_enabled": true, - # {"num_cpus": 0, "memory_mb": 0, "time_limit_minutes": 30} + try: + resource_key, resource = selector.select_resource( + job_type=job_type, + resource_options=resource_options, + user=workspace.user_id, + ) + except ResourceSelectionError as e: + logger.error("Resource selection failed: %s", str(e)) + raise ParseError(f"Unable to find suitable compute resource for this job. {e}") + + logger.info( + "Selected resource: %s (%s) for job type: %s", + resource_key, + type(resource).__name__, + job_type, + ) if not resource.validate_options(resource_options): raise ParseError("Invalid resource options found.") @@ -310,13 +329,17 @@ def put(self, request, workspace_id, put_type=None): "user_workspaces_server" ).available_job_types.get(job_type) + environment_details = job_type_config.get("environment_details", {}) + job_type_env = environment_details.get( + resource_key, + environment_details.get(settings.UWS_CONFIG["main_resource"]), + ) + job_to_launch = utils.generate_controller_object( job_type_config["job_type"], "jobtypes", { - "config": job_type_config["environment_details"][ - settings.UWS_CONFIG["main_resource"] - ], + "config": job_type_env, "job_details": model_to_dict(job), }, ) From c5a3b98533e2e94b892a4ddc8bebabb75c440ea5 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Mon, 20 Apr 2026 15:16:11 -0400 Subject: [PATCH 02/22] fix: adjust utilization calculation. If we're @ max util, then we should penalize heavily. --- src/user_workspaces_server/services/resource_selector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/user_workspaces_server/services/resource_selector.py b/src/user_workspaces_server/services/resource_selector.py index f42b4df1..afbb31b3 100644 --- a/src/user_workspaces_server/services/resource_selector.py +++ b/src/user_workspaces_server/services/resource_selector.py @@ -183,9 +183,10 @@ def _score_resources( cost = selection.get("cost_per_core_hour", 1.0) score -= cost * 5 - # Factor 5: Current load (-20 to 0 points) + # Factor 5: Current load — scale score down by utilization so a + # fully-loaded resource can never beat an idle one on priority alone. utilization = self._get_utilization(resource_name, resource) - score -= utilization * 20 + score *= (1 - utilization) scored.append( { From f5611eb75f11411224e03c0abb8dc6f0ebf43041 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Mon, 20 Apr 2026 15:49:38 -0400 Subject: [PATCH 03/22] fix: remove utilization from return --- src/user_workspaces_server/services/resource_selector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/user_workspaces_server/services/resource_selector.py b/src/user_workspaces_server/services/resource_selector.py index afbb31b3..7a9eb9f9 100644 --- a/src/user_workspaces_server/services/resource_selector.py +++ b/src/user_workspaces_server/services/resource_selector.py @@ -193,7 +193,6 @@ def _score_resources( "resource_name": resource_name, "resource": resource, "score": score, - "utilization": utilization, } ) From 5473d5ef418d2d3e8030d4a8855baed733481bda Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Fri, 24 Apr 2026 16:03:29 -0400 Subject: [PATCH 04/22] fix: use the resource that was selected rather than the default resource everywhere --- .../controllers/jobtypes/appyter_job.py | 2 +- .../controllers/jobtypes/jupyter_lab_job.py | 2 +- .../controllers/jobtypes/yac_job.py | 2 +- src/user_workspaces_server/tasks.py | 16 ++++++++++------ .../views/workspace_view.py | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/user_workspaces_server/controllers/jobtypes/appyter_job.py b/src/user_workspaces_server/controllers/jobtypes/appyter_job.py index 1949882a..c9e25f8d 100644 --- a/src/user_workspaces_server/controllers/jobtypes/appyter_job.py +++ b/src/user_workspaces_server/controllers/jobtypes/appyter_job.py @@ -34,7 +34,7 @@ def get_script(self, template_params=None): return script def status_check(self, job_model): - resource = apps.get_app_config("user_workspaces_server").main_resource + resource = apps.get_app_config("user_workspaces_server").available_resources[job_model.resource_name] if job_model.status == models.Job.Status.FAILED: return { diff --git a/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py b/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py index 32c8ccba..0f11347e 100644 --- a/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py +++ b/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py @@ -29,7 +29,7 @@ def get_script(self, template_params=None): def status_check(self, job_model): output_file_name = f"JupyterLabJob_{job_model.id}_output.log" - resource = apps.get_app_config("user_workspaces_server").main_resource + resource = apps.get_app_config("user_workspaces_server").available_resources[job_model.resource_name] if job_model.status == models.Job.Status.FAILED: return { diff --git a/src/user_workspaces_server/controllers/jobtypes/yac_job.py b/src/user_workspaces_server/controllers/jobtypes/yac_job.py index 482ee89a..8fa0dcf2 100644 --- a/src/user_workspaces_server/controllers/jobtypes/yac_job.py +++ b/src/user_workspaces_server/controllers/jobtypes/yac_job.py @@ -42,7 +42,7 @@ def get_script(self, template_params=None): return script def status_check(self, job_model): - resource = apps.get_app_config("user_workspaces_server").main_resource + resource = apps.get_app_config("user_workspaces_server").available_resources[job_model.resource_name] if job_model.status == models.Job.Status.FAILED: return { diff --git a/src/user_workspaces_server/tasks.py b/src/user_workspaces_server/tasks.py index 99f4bf73..d386ea18 100644 --- a/src/user_workspaces_server/tasks.py +++ b/src/user_workspaces_server/tasks.py @@ -26,7 +26,7 @@ def update_job_status(job_id): logger.exception(f"Job {job_id} does not exist.") raise - resource = apps.get_app_config("user_workspaces_server").main_resource + resource = apps.get_app_config("user_workspaces_server").available_resources[job.resource_name] resource_job_info = resource.get_resource_job(job) current_job_status = resource_job_info["status"] @@ -70,13 +70,17 @@ def update_job_status(job_id): job.job_type ) + environment_details = job_type_config.get("environment_details", {}) + job_type_env = environment_details.get( + resource_key, + environment_details.get(settings.UWS_CONFIG["main_resource"]), + ) + job_type = utils.generate_controller_object( job_type_config["job_type"], "jobtypes", { - "config": job_type_config["environment_details"][ - settings.UWS_CONFIG["main_resource"] - ], + "config": job_type_env, "job_details": model_to_dict(job), }, ) @@ -161,7 +165,7 @@ def update_job_core_hours(job_id): logger.exception(f"Job {job_id} does not exist.") raise - resource = apps.get_app_config("user_workspaces_server").main_resource + resource = apps.get_app_config("user_workspaces_server").available_resources[job.resource_name] job.core_hours = resource.get_job_core_hours(job) job.save() user_quota = models.UserQuota.objects.filter(user_id=job.workspace_id.user_id).first() @@ -179,7 +183,7 @@ def stop_job(job_id): logger.exception(f"Job {job_id} does not exist.") raise - resource = apps.get_app_config("user_workspaces_server").main_resource + resource = apps.get_app_config("user_workspaces_server").available_resources[job.resource_name] if not resource.stop_job(job): job.status = models.Job.Status.FAILED job.save() diff --git a/src/user_workspaces_server/views/workspace_view.py b/src/user_workspaces_server/views/workspace_view.py index 717aaaa6..8b25ce23 100644 --- a/src/user_workspaces_server/views/workspace_view.py +++ b/src/user_workspaces_server/views/workspace_view.py @@ -314,7 +314,7 @@ def put(self, request, workspace_id, put_type=None): "current_job_details": {}, }, "resource_options": translated_options, - "resource_name": type(resource).__name__, + "resource_name": resource_key, "status": "pending", "resource_job_id": -1, "core_hours": 0, From 2b2730c490700ebf546ee39ec8af8c23cb2e5586 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Fri, 24 Apr 2026 16:07:56 -0400 Subject: [PATCH 05/22] fix: bad copy pasta --- src/user_workspaces_server/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/user_workspaces_server/tasks.py b/src/user_workspaces_server/tasks.py index d386ea18..4e5f7f77 100644 --- a/src/user_workspaces_server/tasks.py +++ b/src/user_workspaces_server/tasks.py @@ -72,7 +72,7 @@ def update_job_status(job_id): environment_details = job_type_config.get("environment_details", {}) job_type_env = environment_details.get( - resource_key, + job.resource_name, environment_details.get(settings.UWS_CONFIG["main_resource"]), ) From e6c9765027a6a666efe6d69a11cd0f147253e9af Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Wed, 10 Jun 2026 17:34:08 +0000 Subject: [PATCH 06/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index db915e59..100cd683 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -0a68c3b +a832df1 From 69d9c940e4ee5dbd42a9603e961edb49fe36c41e Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Mon, 22 Jun 2026 14:35:31 -0400 Subject: [PATCH 07/22] fix: check against resource name as well, otherwise everyone will have access to the B2 cluster as well (in code, not in reality). --- .../userauthenticationmethods/psc_api_user_authentication.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py index 825f876b..de2a4a71 100644 --- a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py +++ b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py @@ -275,7 +275,8 @@ def get_external_user(self, external_user_info): for allocation_user in external_user.get("allocationUsers", []): allocation = allocation_user.get("allocation", {}) - if allocation.get("grant", {}).get("number", False) == self.grant_number: + if (allocation.get("grant", {}).get("number", False) == self.grant_number + and allocation.get("resource", {}).get("name", False) == self.resource_name): gid = allocation.get("gid", False) if not gid: From 506535c06b3b9ecbb229b7e349ba0e5eb3af3e92 Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Mon, 22 Jun 2026 18:35:55 +0000 Subject: [PATCH 08/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index 100cd683..6f27da13 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -a832df1 +b06a847 From 70732760e3ad729efd54346ead3ce9ace62c158b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 22 Jun 2026 18:36:23 +0000 Subject: [PATCH 09/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index 6f27da13..4e9ffcd6 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -b06a847 +9fb8a73 From a6533c3e85ec8bdfd4d7b77a6ac5433e69f80ac4 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Wed, 24 Jun 2026 15:32:47 -0400 Subject: [PATCH 10/22] fix: check against resource name as well, otherwise everyone will have access to the B2 cluster as well (in code, not in reality). --- src/user_workspaces_server/apps.py | 4 ++-- .../controllers/resources/abstract_resource.py | 1 + .../controllers/resources/slurm_api_resource.py | 4 ++-- .../abstract_user_authentication.py | 1 + .../userauthenticationmethods/globus_user_authentication.py | 6 +++--- .../userauthenticationmethods/local_user_authentication.py | 6 +++--- .../psc_api_user_authentication.py | 6 +++--- 7 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/user_workspaces_server/apps.py b/src/user_workspaces_server/apps.py index a2ee7016..07f50356 100644 --- a/src/user_workspaces_server/apps.py +++ b/src/user_workspaces_server/apps.py @@ -32,7 +32,7 @@ def ready(self): utils.generate_controller_object( user_authentication_dict["user_authentication_type"], "userauthenticationmethods", - {"config": user_authentication_dict}, + {"config": {**user_authentication_dict, "auth_name": user_authentication_name}}, ) ) @@ -53,7 +53,7 @@ def ready(self): resource_dict["resource_type"], "resources", { - "config": resource_dict, + "config": {**resource_dict, "resource_name": resource_name}, "resource_storage": self.available_storage_methods[resource_dict["storage"]], "resource_user_authentication": self.available_user_authentication_methods[ resource_dict["user_authentication"] diff --git a/src/user_workspaces_server/controllers/resources/abstract_resource.py b/src/user_workspaces_server/controllers/resources/abstract_resource.py index 60f9b896..44c7864a 100644 --- a/src/user_workspaces_server/controllers/resources/abstract_resource.py +++ b/src/user_workspaces_server/controllers/resources/abstract_resource.py @@ -14,6 +14,7 @@ class AbstractResource(ABC): def __init__(self, config, resource_storage, resource_user_authentication): self.config = config + self.resource_name = config.get("resource_name", type(self).__name__) self.resource_storage = resource_storage self.resource_user_authentication = resource_user_authentication self.passthrough_domain = config.get("passthrough_domain", "") diff --git a/src/user_workspaces_server/controllers/resources/slurm_api_resource.py b/src/user_workspaces_server/controllers/resources/slurm_api_resource.py index 6d66727e..cc200dc9 100644 --- a/src/user_workspaces_server/controllers/resources/slurm_api_resource.py +++ b/src/user_workspaces_server/controllers/resources/slurm_api_resource.py @@ -225,7 +225,7 @@ def get_user_token(self, external_user): external_user_mapping = self.resource_user_authentication.get_external_user_mapping( { "user_id": external_user.user_id, - "user_authentication_name": f"{type(self).__name__}Authentication", + "user_authentication_name": f"{self.resource_name}Authentication", } ) @@ -234,7 +234,7 @@ def get_user_token(self, external_user): external_user_mapping = self.resource_user_authentication.create_external_user_mapping( { "user_id": external_user.user_id, - "user_authentication_name": f"{type(self).__name__}Authentication", + "user_authentication_name": f"{self.resource_name}Authentication", "external_user_id": external_user.user_id, "external_username": external_user.external_username, "external_user_details": {"token": token}, diff --git a/src/user_workspaces_server/controllers/userauthenticationmethods/abstract_user_authentication.py b/src/user_workspaces_server/controllers/userauthenticationmethods/abstract_user_authentication.py index 6281948b..1c53fc43 100644 --- a/src/user_workspaces_server/controllers/userauthenticationmethods/abstract_user_authentication.py +++ b/src/user_workspaces_server/controllers/userauthenticationmethods/abstract_user_authentication.py @@ -11,6 +11,7 @@ class AbstractUserAuthentication(ABC): def __init__(self, config): + self.auth_name = config.get("auth_name", type(self).__name__) self.connection_details = config.get("connection_details", {}) @abstractmethod diff --git a/src/user_workspaces_server/controllers/userauthenticationmethods/globus_user_authentication.py b/src/user_workspaces_server/controllers/userauthenticationmethods/globus_user_authentication.py index 6a56d906..92730d40 100644 --- a/src/user_workspaces_server/controllers/userauthenticationmethods/globus_user_authentication.py +++ b/src/user_workspaces_server/controllers/userauthenticationmethods/globus_user_authentication.py @@ -36,7 +36,7 @@ def has_permission(self, internal_user): ExternalUserMapping on success, False on failure """ external_user_mapping = self.get_external_user_mapping( - {"user_id": internal_user, "user_authentication_name": type(self).__name__} + {"user_id": internal_user, "user_authentication_name": self.auth_name} ) if not external_user_mapping: @@ -146,7 +146,7 @@ def api_authenticate(self, request): external_user_mapping = self.get_external_user_mapping( { "external_user_id": globus_user_info["sub"], - "user_authentication_name": type(self).__name__, + "user_authentication_name": self.auth_name, } ) @@ -184,7 +184,7 @@ def api_authenticate(self, request): self.create_external_user_mapping( { "user_id": internal_user, - "user_authentication_name": type(self).__name__, + "user_authentication_name": self.auth_name, "external_user_id": globus_user_info["sub"], "external_username": globus_user_info["username"], "external_user_details": globus_user_info, diff --git a/src/user_workspaces_server/controllers/userauthenticationmethods/local_user_authentication.py b/src/user_workspaces_server/controllers/userauthenticationmethods/local_user_authentication.py index b3d94ed8..e5b1a2a9 100644 --- a/src/user_workspaces_server/controllers/userauthenticationmethods/local_user_authentication.py +++ b/src/user_workspaces_server/controllers/userauthenticationmethods/local_user_authentication.py @@ -21,7 +21,7 @@ def __init__(self, config): def has_permission(self, internal_user): external_user_mapping = self.get_external_user_mapping( - {"user_id": internal_user, "user_authentication_name": type(self).__name__} + {"user_id": internal_user, "user_authentication_name": self.auth_name} ) if not external_user_mapping: @@ -42,7 +42,7 @@ def has_permission(self, internal_user): external_user_mapping = self.create_external_user_mapping( { "user_id": internal_user, - "user_authentication_name": type(self).__name__, + "user_authentication_name": self.auth_name, "external_user_id": external_user["external_user_uid"], "external_username": external_user["external_username"], "external_user_details": external_user["external_user_details"], @@ -90,7 +90,7 @@ def api_authenticate(self, request): external_user_mapping = self.get_external_user_mapping( { - "user_authentication_name": type(self).__name__, + "user_authentication_name": self.auth_name, "external_username": user_info["username"], } ) diff --git a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py index de2a4a71..af3df7fb 100644 --- a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py +++ b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py @@ -30,7 +30,7 @@ def __init__(self, config): def has_permission(self, internal_user): external_user_mapping = self.get_external_user_mapping( - {"user_id": internal_user, "user_authentication_name": type(self).__name__} + {"user_id": internal_user, "user_authentication_name": self.auth_name} ) if not external_user_mapping: @@ -54,7 +54,7 @@ def has_permission(self, internal_user): external_user_mapping = self.create_external_user_mapping( { "user_id": internal_user, - "user_authentication_name": type(self).__name__, + "user_authentication_name": self.auth_name, "external_user_id": external_user["external_user_id"], "external_username": external_user["external_username"], "external_user_details": external_user["external_user_details"], @@ -101,7 +101,7 @@ def api_authenticate(self, request): external_user_mapping = self.get_external_user_mapping( { - "user_authentication_name": type(self).__name__, + "user_authentication_name": self.auth_name, "external_username": user_info["username"], } ) From 2b6271a32ecb9516abac618bce0c833afc59cec0 Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Wed, 24 Jun 2026 19:33:03 +0000 Subject: [PATCH 11/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index 4e9ffcd6..fd400f95 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -9fb8a73 +8307849 From 6709494d5d1b1e7fe794dbcffd3d5c1b20039b38 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Wed, 24 Jun 2026 16:10:00 -0400 Subject: [PATCH 12/22] fix: check grant number only when defined. always check against resource name --- .../psc_api_user_authentication.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py index af3df7fb..3fbfd5ed 100644 --- a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py +++ b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py @@ -275,10 +275,17 @@ def get_external_user(self, external_user_info): for allocation_user in external_user.get("allocationUsers", []): allocation = allocation_user.get("allocation", {}) - if (allocation.get("grant", {}).get("number", False) == self.grant_number - and allocation.get("resource", {}).get("name", False) == self.resource_name): - gid = allocation.get("gid", False) + grant_number = allocation.get("grant", {}).get("number", False) + resource_name = allocation_user.get("resource", {}).get("name", False) + active = allocation_user.get("active", False) + + if not active: + continue + if resource_name == self.resource_name and ( + not self.grant_number or grant_number == self.grant_number + ): + gid = allocation.get("gid", False) if not gid: # If this user is not assigned to this grant, then we need to assign the user if not (gid := self.add_external_user_to_allocation(external_user["username"])): From 73e6ce5d49f9f781d6df943208b4432e1e764bc5 Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Wed, 24 Jun 2026 20:10:24 +0000 Subject: [PATCH 13/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index fd400f95..5b20c5b7 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -8307849 +86501c8 From 7eff0565b76dd18dc7ed4bed1623eced9c603178 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Wed, 24 Jun 2026 16:18:35 -0400 Subject: [PATCH 14/22] fix: allocation != allocation_user --- .../userauthenticationmethods/psc_api_user_authentication.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py index 3fbfd5ed..8619cb99 100644 --- a/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py +++ b/src/user_workspaces_server/controllers/userauthenticationmethods/psc_api_user_authentication.py @@ -276,8 +276,8 @@ def get_external_user(self, external_user_info): for allocation_user in external_user.get("allocationUsers", []): allocation = allocation_user.get("allocation", {}) grant_number = allocation.get("grant", {}).get("number", False) - resource_name = allocation_user.get("resource", {}).get("name", False) - active = allocation_user.get("active", False) + resource_name = allocation.get("resource", {}).get("name", False) + active = allocation.get("active", False) if not active: continue From 975258b0859baec78ee007d09c67718ff420ca3c Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Wed, 24 Jun 2026 20:18:52 +0000 Subject: [PATCH 15/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index 5b20c5b7..0be6159b 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -86501c8 +3f3e66d From 7adb19029005a00b3e8148ba9ef42f6cbade1c96 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Tue, 30 Jun 2026 09:29:10 -0400 Subject: [PATCH 16/22] fix: resource name not resource class name --- src/user_workspaces_server/services/resource_selector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/user_workspaces_server/services/resource_selector.py b/src/user_workspaces_server/services/resource_selector.py index 7a9eb9f9..8c52d765 100644 --- a/src/user_workspaces_server/services/resource_selector.py +++ b/src/user_workspaces_server/services/resource_selector.py @@ -209,7 +209,7 @@ def _get_utilization(self, resource_name: str, resource) -> float: if not max_jobs: return 0.0 - current = self.load_tracker.get_job_count(type(resource).__name__) + current = self.load_tracker.get_job_count(resource_name) return min(current / max_jobs, 1.0) def _is_resource_healthy(self, resource) -> bool: From 486249716647c1cf7ae7904f54f20b39e364f71d Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Tue, 30 Jun 2026 09:39:56 -0400 Subject: [PATCH 17/22] chore: update docs to reflect actual score calc --- docs/resource-selection.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/resource-selection.md b/docs/resource-selection.md index 737c9c26..8f29e232 100644 --- a/docs/resource-selection.md +++ b/docs/resource-selection.md @@ -26,12 +26,13 @@ Resources that do not have a `capabilities` section in their config are skipped Each eligible resource receives a score and the highest scorer is selected: ``` -score = (priority × 10) - + (50 if job_type in preferred_for_job_types) - + (30 if GPU job on GPU resource) - + (20 if CPU job on CPU-only resource) - - (cost_per_core_hour × 5) - - (utilization_fraction × 20) +score = ( + (priority × 10) + + (50 if job_type in preferred_for_job_types) + + (30 if GPU job on GPU resource) + + (20 if CPU job on CPU-only resource) + - (cost_per_core_hour × 5) + ) * (1 - utilization_fraction) ``` `utilization_fraction` = active jobs / `capabilities.max_concurrent_jobs`, capped at 1.0. Omit `max_concurrent_jobs` from a resource's capabilities to disable utilization scoring for that resource. From e766bdcfbc7aa5c3071dc27fd2b85e0b895d8bd3 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Tue, 30 Jun 2026 09:51:03 -0400 Subject: [PATCH 18/22] chore: syntax --- src/tests/services/test_resource_selector.py | 40 ++++++++++++++----- src/user_workspaces_server/apps.py | 7 +++- .../controllers/jobtypes/appyter_job.py | 4 +- .../controllers/jobtypes/jupyter_lab_job.py | 4 +- .../controllers/jobtypes/yac_job.py | 4 +- .../services/resource_selector.py | 2 +- 6 files changed, 46 insertions(+), 15 deletions(-) diff --git a/src/tests/services/test_resource_selector.py b/src/tests/services/test_resource_selector.py index c48df5ac..774a9c09 100644 --- a/src/tests/services/test_resource_selector.py +++ b/src/tests/services/test_resource_selector.py @@ -11,7 +11,7 @@ def make_resource( gpu_enabled=False, max_cpus=64, - max_memory_mb=262144, # 256 GB in MB + max_memory_mb=262144, # 256 GB in MB max_time_minutes=10080, # 7 days in minutes max_gpus=0, max_concurrent_jobs=100, @@ -45,12 +45,22 @@ def make_resource( def make_two_resource_dict(): gpu = make_resource( - gpu_enabled=True, max_cpus=128, max_memory_mb=524288, max_time_minutes=2880, - priority=10, cost=1.5, preferred_for=["jupyter_lab"], + gpu_enabled=True, + max_cpus=128, + max_memory_mb=524288, + max_time_minutes=2880, + priority=10, + cost=1.5, + preferred_for=["jupyter_lab"], ) cpu = make_resource( - gpu_enabled=False, max_cpus=64, max_memory_mb=262144, max_time_minutes=10080, - priority=5, cost=0.8, preferred_for=["jupyter_lab"], + gpu_enabled=False, + max_cpus=64, + max_memory_mb=262144, + max_time_minutes=10080, + priority=5, + cost=0.8, + preferred_for=["jupyter_lab"], ) return {"gpu_cluster": gpu, "cpu_cluster": cpu} @@ -108,26 +118,34 @@ def test_resources_without_capabilities_config_are_skipped(self): self.assertEqual(eligible, []) def test_unhealthy_resource_filtered(self): - self.available["cpu_cluster"].config["connection_details"]["health_check_url"] = "http://example.com/health" + self.available["cpu_cluster"].config["connection_details"][ + "health_check_url" + ] = "http://example.com/health" self.available["cpu_cluster"].health_check.return_value = {"connected": False} eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) names = [name for name, _ in eligible] self.assertNotIn("cpu_cluster", names) def test_resource_filtered_when_auth_denies(self): - self.available["cpu_cluster"].resource_user_authentication.has_permission.return_value = False + self.available["cpu_cluster"].resource_user_authentication.has_permission.return_value = ( + False + ) eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) names = [name for name, _ in eligible] self.assertNotIn("cpu_cluster", names) def test_resource_included_when_auth_allows(self): - self.available["cpu_cluster"].resource_user_authentication.has_permission.return_value = MagicMock() + self.available["cpu_cluster"].resource_user_authentication.has_permission.return_value = ( + MagicMock() + ) eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) names = [name for name, _ in eligible] self.assertIn("cpu_cluster", names) def test_resource_filtered_when_auth_raises(self): - self.available["cpu_cluster"].resource_user_authentication.has_permission.side_effect = Exception("auth error") + self.available["cpu_cluster"].resource_user_authentication.has_permission.side_effect = ( + Exception("auth error") + ) eligible = self.selector._filter_eligible_resources("job", BASE_OPTIONS, self.user) names = [name for name, _ in eligible] self.assertNotIn("cpu_cluster", names) @@ -213,7 +231,9 @@ class TestLoadTracker(TestCase): def test_get_job_count_queries_db(self): tracker = LoadTracker() - with patch("user_workspaces_server.services.resource_selector.LoadTracker.get_job_count") as mock_count: + with patch( + "user_workspaces_server.services.resource_selector.LoadTracker.get_job_count" + ) as mock_count: mock_count.return_value = 5 result = tracker.get_job_count("SlurmAPIResource") self.assertEqual(result, 5) diff --git a/src/user_workspaces_server/apps.py b/src/user_workspaces_server/apps.py index 07f50356..0548b440 100644 --- a/src/user_workspaces_server/apps.py +++ b/src/user_workspaces_server/apps.py @@ -32,7 +32,12 @@ def ready(self): utils.generate_controller_object( user_authentication_dict["user_authentication_type"], "userauthenticationmethods", - {"config": {**user_authentication_dict, "auth_name": user_authentication_name}}, + { + "config": { + **user_authentication_dict, + "auth_name": user_authentication_name, + } + }, ) ) diff --git a/src/user_workspaces_server/controllers/jobtypes/appyter_job.py b/src/user_workspaces_server/controllers/jobtypes/appyter_job.py index c9e25f8d..82c1b126 100644 --- a/src/user_workspaces_server/controllers/jobtypes/appyter_job.py +++ b/src/user_workspaces_server/controllers/jobtypes/appyter_job.py @@ -34,7 +34,9 @@ def get_script(self, template_params=None): return script def status_check(self, job_model): - resource = apps.get_app_config("user_workspaces_server").available_resources[job_model.resource_name] + resource = apps.get_app_config("user_workspaces_server").available_resources[ + job_model.resource_name + ] if job_model.status == models.Job.Status.FAILED: return { diff --git a/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py b/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py index 0f11347e..d46f164e 100644 --- a/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py +++ b/src/user_workspaces_server/controllers/jobtypes/jupyter_lab_job.py @@ -29,7 +29,9 @@ def get_script(self, template_params=None): def status_check(self, job_model): output_file_name = f"JupyterLabJob_{job_model.id}_output.log" - resource = apps.get_app_config("user_workspaces_server").available_resources[job_model.resource_name] + resource = apps.get_app_config("user_workspaces_server").available_resources[ + job_model.resource_name + ] if job_model.status == models.Job.Status.FAILED: return { diff --git a/src/user_workspaces_server/controllers/jobtypes/yac_job.py b/src/user_workspaces_server/controllers/jobtypes/yac_job.py index 8fa0dcf2..bf71eb06 100644 --- a/src/user_workspaces_server/controllers/jobtypes/yac_job.py +++ b/src/user_workspaces_server/controllers/jobtypes/yac_job.py @@ -42,7 +42,9 @@ def get_script(self, template_params=None): return script def status_check(self, job_model): - resource = apps.get_app_config("user_workspaces_server").available_resources[job_model.resource_name] + resource = apps.get_app_config("user_workspaces_server").available_resources[ + job_model.resource_name + ] if job_model.status == models.Job.Status.FAILED: return { diff --git a/src/user_workspaces_server/services/resource_selector.py b/src/user_workspaces_server/services/resource_selector.py index 8c52d765..bebc76af 100644 --- a/src/user_workspaces_server/services/resource_selector.py +++ b/src/user_workspaces_server/services/resource_selector.py @@ -186,7 +186,7 @@ def _score_resources( # Factor 5: Current load — scale score down by utilization so a # fully-loaded resource can never beat an idle one on priority alone. utilization = self._get_utilization(resource_name, resource) - score *= (1 - utilization) + score *= 1 - utilization scored.append( { From 8f8fea8ff6c88ab11500c000e86adadda41067e7 Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Tue, 30 Jun 2026 09:53:14 -0400 Subject: [PATCH 19/22] chore: syntax --- src/user_workspaces_server/views/workspace_view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/user_workspaces_server/views/workspace_view.py b/src/user_workspaces_server/views/workspace_view.py index 8b25ce23..37fb9388 100644 --- a/src/user_workspaces_server/views/workspace_view.py +++ b/src/user_workspaces_server/views/workspace_view.py @@ -15,8 +15,8 @@ from user_workspaces_server import models, utils from user_workspaces_server.exceptions import WorkspaceClientException from user_workspaces_server.services.resource_selector import ( - ResourceSelector, ResourceSelectionError, + ResourceSelector, ) from user_workspaces_server.tasks import async_update_workspace From b38e0c7b9731f57fed61f26ae754d1f92bc68d0f Mon Sep 17 00:00:00 2001 From: Juan Muerto Date: Tue, 30 Jun 2026 09:54:46 -0400 Subject: [PATCH 20/22] fix: update v7-next to v7 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 044b4ec6..2bc62827 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -28,6 +28,6 @@ jobs: run: | echo $(git describe --always --dirty --abbrev) > BUILD - name: Git Auto Commit - uses: stefanzweifel/git-auto-commit-action@v7-next + uses: stefanzweifel/git-auto-commit-action@v7 with: commit_message: "Generate new BUILD file" From 0307668adc18d3ff7108e53c6d1c018e00041587 Mon Sep 17 00:00:00 2001 From: jpuerto-psc <68066250+jpuerto-psc@users.noreply.github.com> Date: Tue, 30 Jun 2026 13:54:59 +0000 Subject: [PATCH 21/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index 0be6159b..5b546c9d 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -3f3e66d +409b45f From 84a4f13b8fa76bbad6c35e8573d04c79caf011cc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 30 Jun 2026 13:57:24 +0000 Subject: [PATCH 22/22] Generate new BUILD file --- BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD b/BUILD index 5b546c9d..6e7970a4 100644 --- a/BUILD +++ b/BUILD @@ -1 +1 @@ -409b45f +f5bdee6