diff --git a/src/kernelbot/api/main.py b/src/kernelbot/api/main.py
index 808f5edf..fae33bc9 100644
--- a/src/kernelbot/api/main.py
+++ b/src/kernelbot/api/main.py
@@ -756,6 +756,105 @@ async def admin_update_problems(
     }
 
 
+@app.post("/admin/backfill")
+async def admin_backfill(
+    payload: dict,
+    _: Annotated[None, Depends(require_admin)],
+    db_context=Depends(get_db),
+) -> dict:
+    """Queue a backfill: re-run top submissions against the current task version.
+
+    After an update-problems changes the eval for a leaderboard, old run scores
+    become stale. This endpoint fetches the top N submissions (best per user)
+    from any previous task_version and re-submits each one so that new runs are
+    recorded with the current task_version.
+
+    Payload:
+        leaderboard (str): Leaderboard name (required).
+        gpu (str): GPU type to backfill (required).
+        top_n (int): How many top submissions to re-run (default 100).
+    """
+    leaderboard_name = payload.get("leaderboard")
+    gpu = payload.get("gpu")
+    top_n = payload.get("top_n", 100)
+
+    if not leaderboard_name or not gpu:
+        raise HTTPException(status_code=400, detail="leaderboard and gpu are required")
+
+    if not backend_instance:
+        raise HTTPException(status_code=500, detail="Backend not initialized")
+
+    with db_context as db:
+        task_version = db.get_leaderboard_task_version(leaderboard_name)
+        if task_version <= 1:
+            return {
+                "status": "ok",
+                "message": "Leaderboard is still on task_version 1, nothing to backfill",
+                "queued": 0,
+            }
+
+        submissions = db.get_top_submissions_for_backfill(leaderboard_name, gpu, top_n)
+        lb = db.get_leaderboard(leaderboard_name)
+
+    if not submissions:
+        return {
+            "status": "ok",
+            "message": "No eligible submissions found from previous versions",
+            "queued": 0,
+        }
+
+    if not background_submission_manager:
+        raise HTTPException(
+            status_code=500,
+            detail="Background submission manager not available",
+        )
+
+    queued_ids = []
+    errors = []
+    for sub in submissions:
+        try:
+            req = ProcessedSubmissionRequest(
+                code=sub["code"],
+                file_name=sub["file_name"],
+                user_id=sub["user_id"],
+                user_name=sub["user_name"],
+                leaderboard=leaderboard_name,
+                gpus=[gpu],
+                task=lb["task"],
+                secret_seed=lb.get("secret_seed", 0),
+                task_gpus=[gpu],
+            )
+            with db_context as db:
+                new_sub_id = db.create_submission(
+                    leaderboard=leaderboard_name,
+                    file_name=sub["file_name"],
+                    code=sub["code"],
+                    user_id=sub["user_id"],
+                    time=datetime.datetime.now(),
+                    user_name=sub["user_name"],
+                )
+            await background_submission_manager.enqueue(
+                req, SubmissionMode.LEADERBOARD, new_sub_id
+            )
+            queued_ids.append(new_sub_id)
+        except Exception as e:
+            errors.append({
+                "submission_id": sub["submission_id"],
+                "user_id": sub["user_id"],
+                "error": str(e),
+            })
+
+    return {
+        "status": "ok",
+        "leaderboard": leaderboard_name,
+        "gpu": gpu,
+        "task_version": task_version,
+        "queued": len(queued_ids),
+        "queued_submission_ids": queued_ids,
+        "errors": errors,
+    }
+
+
 @app.post("/admin/export-hf")
 async def admin_export_hf(
     payload: dict,
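For reference (not part of the patch): a minimal sketch of driving the new
endpoint from an admin script. The base URL, the ADMIN_TOKEN environment
variable, and the leaderboard/GPU names are illustrative assumptions only:

    # Hypothetical admin-side call to POST /admin/backfill.
    import os

    import requests

    resp = requests.post(
        "http://localhost:8000/admin/backfill",  # placeholder base URL
        headers={"Authorization": f"Bearer {os.environ['ADMIN_TOKEN']}"},
        json={"leaderboard": "submit-leaderboard", "gpu": "A100", "top_n": 50},
        timeout=30,
    )
    resp.raise_for_status()
    data = resp.json()
    print(f"queued {data['queued']} runs at task_version {data['task_version']}")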
diff --git a/src/kernelbot/cogs/admin_cog.py b/src/kernelbot/cogs/admin_cog.py
index 8b21747a..2c47fa6d 100644
--- a/src/kernelbot/cogs/admin_cog.py
+++ b/src/kernelbot/cogs/admin_cog.py
@@ -135,6 +135,10 @@ def __init__(self, bot: "ClusterBot"):
             name="export-hf", description="Export competition data to Hugging Face dataset"
         )(self.export_to_hf)
 
+        self.backfill = bot.admin_group.command(
+            name="backfill", description="Re-run top submissions after eval change"
+        )(self.backfill)
+
         self._scheduled_cleanup_temp_users.start()
         if env.HF_TOKEN:
             self._scheduled_hf_export.start()
@@ -200,6 +204,114 @@ async def unban_user(self, interaction: discord.Interaction, user_id: str):
                 interaction, f"User `{user_id}` not found.", ephemeral=True
             )
 
+    @discord.app_commands.describe(
+        leaderboard_name="Name of the leaderboard to backfill",
+        gpu="GPU type to backfill",
+        top_n="Number of top submissions to re-run (default 100)",
+    )
+    @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
+    @app_commands.choices(
+        gpu=[app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in GitHubGPU]
+        + [app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in ModalGPU]
+    )
+    @with_error_handling
+    async def backfill(
+        self,
+        interaction: discord.Interaction,
+        leaderboard_name: str,
+        gpu: str,
+        top_n: int = 100,
+    ):
+        if not await self.admin_check(interaction):
+            await send_discord_message(
+                interaction, "You need to have Admin permissions to run this command", ephemeral=True
+            )
+            return
+
+        await interaction.response.defer(ephemeral=True)
+
+        with self.bot.leaderboard_db as db:
+            task_version = db.get_leaderboard_task_version(leaderboard_name)
+            if task_version <= 1:
+                await interaction.edit_original_response(
+                    content=f"Leaderboard `{leaderboard_name}` is on task_version 1 — nothing to backfill."
+                )
+                return
+
+            submissions = db.get_top_submissions_for_backfill(leaderboard_name, gpu, top_n)
+            lb = db.get_leaderboard(leaderboard_name)
+
+        if not submissions:
+            await interaction.edit_original_response(
+                content=f"No eligible submissions found for `{leaderboard_name}` ({gpu}) from previous versions."
+            )
+            return
+
+        await interaction.edit_original_response(
+            content=(
+                f"**Backfill: {leaderboard_name} ({gpu}) v{task_version - 1} → v{task_version}**\n"
+                f"Found {len(submissions)} submissions to re-run\n"
+                f"Queued: 0/{len(submissions)}"
+            )
+        )
+
+        from libkernelbot.background_submission_manager import (
+            BackgroundSubmissionManagerReporter,
+        )
+        from libkernelbot.consts import SubmissionMode
+        from libkernelbot.submission import ProcessedSubmissionRequest
+
+        queued = 0
+        errors = 0
+        for sub in submissions:
+            try:
+                req = ProcessedSubmissionRequest(
+                    code=sub["code"],
+                    file_name=sub["file_name"],
+                    user_id=sub["user_id"],
+                    user_name=sub["user_name"],
+                    leaderboard=leaderboard_name,
+                    gpus=[gpu],
+                    task=lb["task"],
+                    secret_seed=lb.get("secret_seed", 0),
+                    task_gpus=[gpu],
+                )
+
+                with self.bot.leaderboard_db as db:
+                    new_sub_id = db.create_submission(
+                        leaderboard=leaderboard_name,
+                        file_name=sub["file_name"],
+                        code=sub["code"],
+                        user_id=sub["user_id"],
+                        time=datetime.now(tz=timezone.utc),
+                        user_name=sub["user_name"],
+                    )
+
+                reporter = BackgroundSubmissionManagerReporter(new_sub_id, self.bot.backend)
+                # Fire and forget — don't block on each submission
+                self.bot.loop.create_task(
+                    self.bot.backend.submit_full(req, SubmissionMode.LEADERBOARD, reporter, new_sub_id)
+                )
+                queued += 1
+            except Exception as e:
+                logger.error("Backfill error for submission %s: %s", sub["submission_id"], e)
+                errors += 1
+
+            if queued % 5 == 0 or queued + errors == len(submissions):
+                await interaction.edit_original_response(
+                    content=(
+                        f"**Backfill: {leaderboard_name} ({gpu}) v{task_version - 1} → v{task_version}**\n"
+                        f"Found {len(submissions)} submissions to re-run\n"
+                        f"Queued: {queued}/{len(submissions)}\n"
+                        f"Errors: {errors}"
+                    )
+                )
+
+        await interaction.edit_original_response(
+            content=(
+                f"**Backfill complete: {leaderboard_name} ({gpu}) v{task_version - 1} → v{task_version}**\n"
+                f"Queued: {queued}/{len(submissions)}\n"
+                f"Errors: {errors}"
+            )
+        )
+
     @discord.app_commands.describe(
         directory="Directory of the kernel definition. Also used as the leaderboard's name",
         gpu="The GPU to submit to. Leave empty for interactive selection/multiple GPUs",
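A caveat on the fire-and-forget pattern above (not part of the patch): the
asyncio event loop holds only weak references to tasks, so discarding the
handle returned by create_task can let a run be garbage-collected mid-flight
and its exception silently dropped. A sketch of one way to guard against
that; the helper names here are illustrative, not from this codebase:

    # Sketch: keep strong references to background tasks and log failures.
    import asyncio
    import logging

    logger = logging.getLogger(__name__)
    _background_tasks: set[asyncio.Task] = set()

    def _log_failure(task: asyncio.Task) -> None:
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error("backfill run failed: %s", exc)

    def spawn(loop: asyncio.AbstractEventLoop, coro) -> asyncio.Task:
        task = loop.create_task(coro)
        _background_tasks.add(task)  # strong reference until done
        task.add_done_callback(_background_tasks.discard)
        task.add_done_callback(_log_failure)
        return task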
diff --git a/src/libkernelbot/leaderboard_db.py b/src/libkernelbot/leaderboard_db.py
index 650599af..bf8760bf 100644
--- a/src/libkernelbot/leaderboard_db.py
+++ b/src/libkernelbot/leaderboard_db.py
@@ -137,16 +137,28 @@ def update_leaderboard(
         task = definition.task
         try:
             lb_id = self.get_leaderboard_id(name)
+
+            # Check if the task actually changed; if so, bump task_version
+            self.cursor.execute(
+                "SELECT task FROM leaderboard.leaderboard WHERE id = %s",
+                (lb_id,),
+            )
+            old_task_json = self.cursor.fetchone()[0]
+            old_task = LeaderboardTask.from_dict(old_task_json)
+            task_changed = old_task != task
+
             self.cursor.execute(
                 """
                 UPDATE leaderboard.leaderboard
-                SET deadline = %s, task = %s, description = %s
+                SET deadline = %s, task = %s, description = %s,
+                    task_version = task_version + CASE WHEN %s THEN 1 ELSE 0 END
                 WHERE id = %s;
                 """,
                 (
                     deadline.astimezone(datetime.timezone.utc),
                     task.to_str(),
                     definition.description,
+                    task_changed,
                     lb_id,
                 ),
             )
@@ -437,6 +449,7 @@ def create_submission_run(
         compilation: Optional[CompileResult],
         result: RunResult,
         system: SystemInfo,
+        task_version: Optional[int] = None,
     ):
         try:
             if compilation is not None:
@@ -445,11 +458,15 @@ def create_submission_run(
             # check validity
             self.cursor.execute(
                 """
-                SELECT done FROM leaderboard.submission WHERE id = %s
+                SELECT s.done, l.task_version
+                FROM leaderboard.submission s
+                JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id
+                WHERE s.id = %s
                 """,
                 (submission,),
            )
-            if self.cursor.fetchone()[0]:
+            row = self.cursor.fetchone()
+            if row[0]:
                 logger.error(
                     "Submission '%s' is already marked as done when trying to add %s run.",
                     submission,
@@ -460,6 +477,9 @@ def create_submission_run(
                     "but submission was already marked as done."
                 )
 
+            # Use provided task_version or fall back to leaderboard's current version
+            run_task_version = task_version if task_version is not None else row[1]
+
             meta = {
                 k: result.__dict__[k]
                 for k in ["stdout", "stderr", "success", "exit_code", "command", "duration"]
@@ -467,9 +487,10 @@ def create_submission_run(
             }
             self.cursor.execute(
                 """
                 INSERT INTO leaderboard.runs (submission_id, start_time, end_time, mode,
-                secret, runner, score, passed, compilation, meta, result, system_info
+                secret, runner, score, passed, compilation, meta, result, system_info,
+                task_version
                 )
-                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                 """,
                 (
                     submission,
@@ -484,6 +505,7 @@ def create_submission_run(
                     json.dumps(meta),
                     json.dumps(result.result),
                     json.dumps(dataclasses.asdict(system)),
+                    run_task_version,
                 ),
             )
             self.connection.commit()
@@ -852,6 +874,7 @@ def get_leaderboard_submissions(
                 AND NOT r.secret
                 AND r.score IS NOT NULL
                 AND r.passed
+                AND r.task_version = l.task_version
                 AND s.user_id = %s
                 ORDER BY r.score ASC
                 LIMIT %s OFFSET %s
@@ -874,6 +897,7 @@ def get_leaderboard_submissions(
                 JOIN leaderboard.user_info ui ON s.user_id = ui.id
                 WHERE l.name = %s AND r.runner = %s AND NOT r.secret
                     AND r.score IS NOT NULL AND r.passed
+                    AND r.task_version = l.task_version
                 ORDER BY s.user_id, r.score ASC
             )
             SELECT
@@ -922,6 +946,84 @@ def get_leaderboard_submissions(
 
         return result
 
+    def get_top_submissions_for_backfill(
+        self,
+        leaderboard_name: str,
+        gpu_name: str,
+        top_n: int = 100,
+    ) -> list[dict]:
+        """Get the top N submissions (best per user) from any previous task_version.
+
+        Returns dicts with: submission_id, user_id, user_name, file_name, code,
+        score, task_version.
+        """
+        self.cursor.execute(
+            """
+            WITH best_submissions AS (
+                SELECT DISTINCT ON (s.user_id)
+                    s.id AS submission_id,
+                    s.user_id,
+                    s.file_name,
+                    r.score,
+                    r.task_version
+                FROM leaderboard.runs r
+                JOIN leaderboard.submission s ON r.submission_id = s.id
+                JOIN leaderboard.leaderboard l ON s.leaderboard_id = l.id
+                WHERE l.name = %s AND r.runner = %s AND NOT r.secret
+                    AND r.score IS NOT NULL AND r.passed
+                    AND r.task_version < l.task_version
+                    AND NOT EXISTS (
+                        SELECT 1 FROM leaderboard.runs r2
+                        JOIN leaderboard.submission s2 ON r2.submission_id = s2.id
+                        WHERE s2.user_id = s.user_id
+                            AND s2.leaderboard_id = l.id
+                            AND r2.runner = r.runner
+                            AND r2.task_version = l.task_version
+                            AND r2.passed AND NOT r2.secret
+                    )
+                ORDER BY s.user_id, r.score ASC
+            )
+            SELECT
+                bs.submission_id,
+                bs.user_id,
+                bs.file_name,
+                convert_from(cf.code, 'UTF8') AS code,
+                bs.score,
+                bs.task_version,
+                ui.user_name
+            FROM best_submissions bs
+            JOIN leaderboard.submission s ON bs.submission_id = s.id
+            JOIN leaderboard.code_files cf ON s.code_id = cf.id
+            JOIN leaderboard.user_info ui ON bs.user_id = ui.id
+            ORDER BY bs.score ASC
+            LIMIT %s
+            """,
+            (leaderboard_name, gpu_name, top_n),
+        )
+
+        return [
+            {
+                "submission_id": row[0],
+                "user_id": row[1],
+                "file_name": row[2],
+                "code": row[3],
+                "score": row[4],
+                "task_version": row[5],
+                "user_name": row[6],
+            }
+            for row in self.cursor.fetchall()
+        ]
+
+    def get_leaderboard_task_version(self, leaderboard_name: str) -> int:
+        """Get the current task_version for a leaderboard."""
+        self.cursor.execute(
+            "SELECT task_version FROM leaderboard.leaderboard WHERE name = %s",
+            (leaderboard_name,),
+        )
+        row = self.cursor.fetchone()
+        if row is None:
+            raise LeaderboardDoesNotExist(leaderboard_name)
+        return row[0]
+
     def generate_stats(self, last_day: bool, leaderboard_name: Optional[str] = None):
         try:
             return self._generate_stats(last_day, leaderboard_name)
@@ -1250,6 +1352,7 @@ def get_leaderboard_submission_count(
                 AND NOT r.secret
                 AND r.score IS NOT NULL
                 AND r.passed
+                AND r.task_version = l.task_version
                 AND s.user_id = %s
             """
             args = (leaderboard_name, gpu_name, user_id)
@@ -1264,6 +1367,7 @@ def get_leaderboard_submission_count(
                 AND NOT r.secret
                 AND r.score IS NOT NULL
                 AND r.passed
+                AND r.task_version = l.task_version
             """
             args = (leaderboard_name, gpu_name)
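For reference (not part of the patch): a sketch of how the two new accessors
might be combined to preview backfill candidates before queueing anything.
`leaderboard_db` stands in for a connected LeaderboardDB-style context
manager, and the leaderboard/GPU names are examples; the dict keys follow
the docstring above:

    # Hypothetical preview of backfill candidates.
    with leaderboard_db as db:
        current = db.get_leaderboard_task_version("submit-leaderboard")
        candidates = db.get_top_submissions_for_backfill(
            "submit-leaderboard", "A100", top_n=10
        )
        for cand in candidates:
            print(
                f"{cand['user_name']}: score {cand['score']} at "
                f"v{cand['task_version']}, needs re-run at v{current}"
            )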
diff --git a/src/migrations/20260313_01_backfill-add-task-version.py b/src/migrations/20260313_01_backfill-add-task-version.py
new file mode 100644
index 00000000..4a457b2d
--- /dev/null
+++ b/src/migrations/20260313_01_backfill-add-task-version.py
@@ -0,0 +1,46 @@
+"""
+add-task-version
+"""
+
+from yoyo import step
+
+__depends__ = {'20260318_01_ban-user'}
+
+
+steps = [
+    step(
+        """
+        ALTER TABLE leaderboard.leaderboard
+        ADD COLUMN task_version INT NOT NULL DEFAULT 1;
+        """,
+        """
+        ALTER TABLE leaderboard.leaderboard
+        DROP COLUMN task_version;
+        """
+    ),
+    step(
+        """
+        ALTER TABLE leaderboard.runs
+        ADD COLUMN task_version INT NOT NULL DEFAULT 1;
+        """,
+        """
+        ALTER TABLE leaderboard.runs
+        DROP COLUMN task_version;
+        """
+    ),
+    # Update the partial index to include task_version for efficient filtering
+    step(
+        """
+        DROP INDEX IF EXISTS leaderboard.idx_runs_valid_scores;
+        CREATE INDEX idx_runs_valid_scores
+        ON leaderboard.runs (submission_id, runner, score, task_version)
+        WHERE NOT secret AND score IS NOT NULL AND passed;
+        """,
+        """
+        DROP INDEX IF EXISTS leaderboard.idx_runs_valid_scores;
+        CREATE INDEX idx_runs_valid_scores
+        ON leaderboard.runs (submission_id, runner, score)
+        WHERE NOT secret AND score IS NOT NULL AND passed;
+        """
+    ),
+]
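Since both new columns default to 1, every pre-existing leaderboard and run
is stamped as version 1, matching the initial task_version of a freshly
created leaderboard. For reference (not part of the patch), a sketch of
applying pending migrations programmatically with yoyo-migrations; the
connection URL and migrations path are placeholders, and the `yoyo apply`
CLI does the same:

    # Placeholder URL and path; applies any not-yet-applied migrations.
    from yoyo import get_backend, read_migrations

    backend = get_backend("postgresql://user:pass@localhost/kernelbot")
    migrations = read_migrations("src/migrations")
    with backend.lock():
        backend.apply_migrations(backend.to_apply(migrations))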
diff --git a/tests/test_admin_api.py b/tests/test_admin_api.py
index e3130606..c5473cf6 100644
--- a/tests/test_admin_api.py
+++ b/tests/test_admin_api.py
@@ -815,3 +815,112 @@ def test_delete_rate_limit_invalid_category(self, test_client):
             headers={"Authorization": "Bearer test_token"},
         )
         assert response.status_code == 400
+
+
+class TestAdminBackfill:
+    """Test admin backfill endpoint."""
+
+    def _setup_db_mock(self, mock_backend):
+        mock_backend.db.__enter__ = MagicMock(return_value=mock_backend.db)
+        mock_backend.db.__exit__ = MagicMock(return_value=None)
+
+    def test_backfill_requires_auth(self, test_client):
+        """POST /admin/backfill requires authorization."""
+        response = test_client.post("/admin/backfill", json={})
+        assert response.status_code == 401
+
+    def test_backfill_requires_leaderboard_and_gpu(self, test_client):
+        """POST /admin/backfill returns 400 when leaderboard or gpu is missing."""
+        response = test_client.post(
+            "/admin/backfill",
+            headers={"Authorization": "Bearer test_token"},
+            json={"leaderboard": "test-lb"},
+        )
+        assert response.status_code == 400
+        assert "leaderboard and gpu are required" in response.json()["detail"]
+
+        response = test_client.post(
+            "/admin/backfill",
+            headers={"Authorization": "Bearer test_token"},
+            json={"gpu": "A100"},
+        )
+        assert response.status_code == 400
+
+    def test_backfill_version_1_noop(self, test_client, mock_backend):
+        """POST /admin/backfill returns queued=0 when task_version is 1."""
+        self._setup_db_mock(mock_backend)
+        mock_backend.db.get_leaderboard_task_version = MagicMock(return_value=1)
+
+        response = test_client.post(
+            "/admin/backfill",
+            headers={"Authorization": "Bearer test_token"},
+            json={"leaderboard": "test-lb", "gpu": "A100"},
+        )
+        assert response.status_code == 200
+        data = response.json()
+        assert data["queued"] == 0
+        assert "task_version 1" in data["message"]
+
+    def test_backfill_no_candidates(self, test_client, mock_backend):
+        """POST /admin/backfill returns queued=0 when no candidates are found."""
+        self._setup_db_mock(mock_backend)
+        mock_backend.db.get_leaderboard_task_version = MagicMock(return_value=2)
+        mock_backend.db.get_top_submissions_for_backfill = MagicMock(return_value=[])
+        mock_backend.db.get_leaderboard = MagicMock(return_value={"task": {}, "secret_seed": 0})
+
+        response = test_client.post(
+            "/admin/backfill",
+            headers={"Authorization": "Bearer test_token"},
+            json={"leaderboard": "test-lb", "gpu": "A100"},
+        )
+        assert response.status_code == 200
+        data = response.json()
+        assert data["queued"] == 0
+
+    def test_backfill_queues_submissions(self, test_client, mock_backend):
+        """POST /admin/backfill queues submissions and returns the count."""
+        self._setup_db_mock(mock_backend)
+        mock_backend.db.get_leaderboard_task_version = MagicMock(return_value=2)
+        mock_backend.db.get_top_submissions_for_backfill = MagicMock(return_value=[
+            {
+                "submission_id": 1,
+                "user_id": "100",
+                "user_name": "alice",
+                "file_name": "sol.py",
+                "code": "print(1)",
+                "score": 1.0,
+                "task_version": 1,
+            },
+            {
+                "submission_id": 2,
+                "user_id": "200",
+                "user_name": "bob",
+                "file_name": "sol.py",
+                "code": "print(2)",
+                "score": 2.0,
+                "task_version": 1,
+            },
+        ])
+        mock_backend.db.get_leaderboard = MagicMock(return_value={"task": {}, "secret_seed": 0})
+        mock_backend.db.create_submission = MagicMock(side_effect=[10, 11])
+
+        with patch("kernelbot.api.main.background_submission_manager") as mock_bsm:
+            async def mock_enqueue(*args, **kwargs):
+                pass
+
+            mock_bsm.enqueue = mock_enqueue
+
+            response = test_client.post(
+                "/admin/backfill",
+                headers={"Authorization": "Bearer test_token"},
+                json={"leaderboard": "test-lb", "gpu": "A100"},
+            )
+
+            assert response.status_code == 200
+            data = response.json()
+            assert data["queued"] == 2
+            assert data["leaderboard"] == "test-lb"
+            assert data["task_version"] == 2
+            assert len(data["queued_submission_ids"]) == 2
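A note on the hand-rolled `mock_enqueue` coroutine in the last test (not
part of the patch): `unittest.mock.AsyncMock` (Python 3.8+) is a drop-in
async stand-in that additionally records awaits for later assertions. A
self-contained sketch of the pattern, with illustrative arguments:

    # Sketch: AsyncMock records each await and its arguments.
    import asyncio
    from unittest.mock import AsyncMock

    mock_enqueue = AsyncMock(return_value=None)
    asyncio.run(mock_enqueue("req", "mode", 10))
    assert mock_enqueue.await_count == 1
    mock_enqueue.assert_awaited_with("req", "mode", 10)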
diff --git a/tests/test_leaderboard_db.py b/tests/test_leaderboard_db.py
index 9c1160f8..eeeba948 100644
--- a/tests/test_leaderboard_db.py
+++ b/tests/test_leaderboard_db.py
@@ -1136,3 +1136,149 @@ def test_check_rate_limit_categories_independent(database, submit_leaderboard):
 
         result = db.check_rate_limit("submit-leaderboard", "123", "test")
         assert result["allowed"] is False
+
+
+# --------------------------------------------------------------------------
+# Task versioning tests
+# --------------------------------------------------------------------------
+
+
+def test_task_version_starts_at_one(database, submit_leaderboard):
+    """New leaderboards start at task_version 1."""
+    with database as db:
+        version = db.get_leaderboard_task_version("submit-leaderboard")
+        assert version == 1
+
+
+def test_task_version_bumps_on_task_change(database, task_directory):
+    """Updating a leaderboard with a different task bumps task_version."""
+    from libkernelbot.task import make_task_definition
+
+    _submit_leaderboard(database, task_directory)
+
+    # Create a modified task.yml
+    modified_yaml = (task_directory / "task.yml").read_text().replace(
+        "input_size: 1000", "input_size: 2000"
+    )
+    (task_directory / "task.yml").write_text(modified_yaml)
+    new_definition = make_task_definition(task_directory / "task.yml")
+
+    deadline = datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(days=1)
+    with database as db:
+        db.update_leaderboard("submit-leaderboard", deadline, new_definition)
+        version = db.get_leaderboard_task_version("submit-leaderboard")
+        assert version == 2
+
+
+def test_task_version_unchanged_on_same_task(database, task_directory):
+    """Updating a leaderboard with the same task does not bump task_version."""
+    from libkernelbot.task import make_task_definition
+
+    _submit_leaderboard(database, task_directory)
+    definition = make_task_definition(task_directory / "task.yml")
+
+    deadline = datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(days=1)
+    with database as db:
+        db.update_leaderboard("submit-leaderboard", deadline, definition)
+        version = db.get_leaderboard_task_version("submit-leaderboard")
+        assert version == 1
+
+
+def test_rankings_filter_by_task_version(database, task_directory):
+    """Rankings only include runs from the current task_version."""
+    from libkernelbot.task import make_task_definition
+
+    _submit_leaderboard(database, task_directory)
+
+    with database as db:
+        # Create a submission and run at v1
+        sub_id = db.create_submission(
+            "submit-leaderboard",
+            file_name="test.py",
+            code="code1",
+            user_id=123,
+            time=datetime.datetime.now(),
+        )
+        _create_submission_run(db, sub_id, runner="A100", score=1.0, mode="leaderboard")
+
+        # Verify the run shows up in rankings at v1
+        count = db.get_leaderboard_submission_count("submit-leaderboard", "A100", None)
+        assert count == 1
+
+    # Bump task_version
+    modified_yaml = (task_directory / "task.yml").read_text().replace(
+        "input_size: 1000", "input_size: 3000"
+    )
+    (task_directory / "task.yml").write_text(modified_yaml)
+    new_definition = make_task_definition(task_directory / "task.yml")
+    deadline = datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(days=1)
+    with database as db:
+        db.update_leaderboard("submit-leaderboard", deadline, new_definition)
+        # Rankings should now be empty (v1 runs filtered out)
+        count = db.get_leaderboard_submission_count("submit-leaderboard", "A100", None)
+        assert count == 0
+
+
+def test_backfill_candidates_found(database, task_directory):
+    """Backfill finds submissions from previous task_version."""
+    from libkernelbot.task import make_task_definition
+
+    _submit_leaderboard(database, task_directory)
+
+    with database as db:
+        sub_id = db.create_submission(
+            "submit-leaderboard",
+            file_name="test.py",
+            code="code1",
+            user_id=123,
+            time=datetime.datetime.now(),
+        )
+        _create_submission_run(db, sub_id, runner="A100", score=1.0, mode="leaderboard")
+
+    # Bump version
+    modified_yaml = (task_directory / "task.yml").read_text().replace(
+        "input_size: 1000", "input_size: 4000"
+    )
+    (task_directory / "task.yml").write_text(modified_yaml)
+    new_definition = make_task_definition(task_directory / "task.yml")
+    deadline = datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(days=1)
+    with database as db:
+        db.update_leaderboard("submit-leaderboard", deadline, new_definition)
+        candidates = db.get_top_submissions_for_backfill("submit-leaderboard", "A100")
+        assert len(candidates) == 1
+        assert candidates[0]["user_id"] == "123"
+        assert candidates[0]["code"] == "code1"
+
+
+def test_backfill_candidates_exclude_current_version(database, task_directory):
+    """Backfill excludes users who already have runs at the current task_version."""
+    from libkernelbot.task import make_task_definition
+
+    _submit_leaderboard(database, task_directory)
+
+    with database as db:
+        sub_id = db.create_submission(
+            "submit-leaderboard",
+            file_name="test.py",
+            code="code1",
+            user_id=123,
+            time=datetime.datetime.now(),
+        )
+        _create_submission_run(db, sub_id, runner="A100", score=1.0, mode="leaderboard")
+
+    # Bump version
+    modified_yaml = (task_directory / "task.yml").read_text().replace(
+        "input_size: 1000", "input_size: 5000"
+    )
+    (task_directory / "task.yml").write_text(modified_yaml)
+    new_definition = make_task_definition(task_directory / "task.yml")
+    deadline = datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(days=1)
+    with database as db:
+        db.update_leaderboard("submit-leaderboard", deadline, new_definition)
+
+        # Create a new submission at v2
+        sub_id2 = db.create_submission(
+            "submit-leaderboard",
+            file_name="test2.py",
+            code="code2",
+            user_id=123,
+            time=datetime.datetime.now(),
+        )
+        _create_submission_run(db, sub_id2, runner="A100", score=0.5, mode="leaderboard")
+
+        # User 123 already has a v2 run, so no backfill candidates
+        candidates = db.get_top_submissions_for_backfill("submit-leaderboard", "A100")
+        assert len(candidates) == 0
+
+
+def test_create_submission_run_gets_current_version(database, task_directory):
+    """Runs auto-get the leaderboard's current task_version when none is specified."""
+    from libkernelbot.task import make_task_definition
+
+    _submit_leaderboard(database, task_directory)
+
+    # Bump version first
+    modified_yaml = (task_directory / "task.yml").read_text().replace(
+        "input_size: 1000", "input_size: 6000"
+    )
+    (task_directory / "task.yml").write_text(modified_yaml)
+    new_definition = make_task_definition(task_directory / "task.yml")
+    deadline = datetime.datetime.now(tz=datetime.timezone.utc) + datetime.timedelta(days=1)
+    with database as db:
+        db.update_leaderboard("submit-leaderboard", deadline, new_definition)
+        version = db.get_leaderboard_task_version("submit-leaderboard")
+        assert version == 2
+
+        # Create a submission and run — should auto-get v2
+        sub_id = db.create_submission(
+            "submit-leaderboard",
+            file_name="test.py",
+            code="code_v2",
+            user_id=456,
+            time=datetime.datetime.now(),
+        )
+        _create_submission_run(db, sub_id, runner="A100", score=1.0, mode="leaderboard")
+
+        # This run should show in rankings (it's at current version)
+        count = db.get_leaderboard_submission_count("submit-leaderboard", "A100", None)
+        assert count == 1
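Taken together, the tests above pin down the intended lifecycle. As a
summary sketch (not part of the patch; method names as in leaderboard_db.py):

    # task_version lifecycle exercised by the tests above:
    #   create leaderboard                    -> task_version == 1
    #   update_leaderboard(..., same task)    -> stays at 1 (no bump)
    #   update_leaderboard(..., changed task) -> bumped to 2
    #   create_submission_run(...)            -> run stamped with the
    #                                            leaderboard's current version
    #   rankings / submission counts          -> only runs where
    #                                            r.task_version = l.task_version
    #   get_top_submissions_for_backfill(...) -> best prior-version run per
    #                                            user lacking a current-version run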