Fix inference fallback, stabilize match sync, and add regression coverage

Dieg0Code · Dieg0Code · commit c781f564a79c · 2026-03-03T00:00:03.000-03:00
diff --git a/.env.example b/.env.example
@@ -45,6 +45,7 @@ INFERENCE_MODE_DEFAULT="fast"   # fast | strong
 INFERENCE_DEVICE="auto"         # auto | cpu | cuda
 INFERENCE_MCTS_SIMS=160
 INFERENCE_C_PUCT=1.5
+INFERENCE_FALLBACK_HEURISTIC_LEVEL="easy"  # easy | normal | hard
 
 # Auth / JWT
 AUTH_JWT_SECRET="change_me_with_a_long_random_secret"
diff --git a/src/api/config/settings.py b/src/api/config/settings.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from functools import lru_cache
+from typing import Literal
 from urllib.parse import quote, urlsplit, urlunsplit
 
 from pydantic import computed_field, model_validator
@@ -60,6 +61,7 @@ class Settings(BaseSettings):
     inference_mcts_sims: int = 160
     inference_c_puct: float = 1.5
     inference_prefer_onnx: bool = True
+    inference_fallback_heuristic_level: Literal["easy", "normal", "hard"] = "easy"
 
     # Auth/JWT
     auth_jwt_secret: str = ""
diff --git a/src/api/modules/gameplay/router.py b/src/api/modules/gameplay/router.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Annotated
+from typing import Annotated, Literal
 from uuid import UUID
 
 import numpy as np
@@ -19,6 +19,7 @@
 
 from agents.heuristic import heuristic_move
 from agents.random_agent import random_move
+from api.config import Settings, get_settings
 from api.db.enums import GameStatus
 from api.db.models import Game, User
 from api.deps.auth import get_auth_service_dep, get_current_user_dep
@@ -48,6 +49,14 @@
 CURRENT_USER_DEP = Depends(get_current_user_dep)
 AUTH_SERVICE_DEP = Depends(get_auth_service_dep)
 logger = logging.getLogger(__name__)
+FALLBACK_MODE_BY_LEVEL: dict[
+    Literal["easy", "normal", "hard"],
+    Literal["heuristic_easy", "heuristic_normal", "heuristic_hard"],
+] = {
+    "easy": "heuristic_easy",
+    "normal": "heuristic_normal",
+    "hard": "heuristic_hard",
+}
 
 
 def _resolve_inference_service(request: Request) -> InferenceService:
@@ -59,6 +68,13 @@ def _resolve_inference_service(request: Request) -> InferenceService:
     return provider()
 
 
+def _resolve_settings(request: Request) -> Settings:
+    state_settings = getattr(request.app.state, "settings", None)
+    if isinstance(state_settings, Settings):
+        return state_settings
+    return get_settings()
+
+
 async def _to_game_response(
     gameplay_service: GameplayService,
     game: Game,
@@ -154,26 +170,29 @@ def post_move(
         except HTTPException as exc:
             if exc.status_code != status.HTTP_503_SERVICE_UNAVAILABLE:
                 raise
+            settings = _resolve_settings(http_request)
+            fallback_level = settings.inference_fallback_heuristic_level
+            fallback_mode = FALLBACK_MODE_BY_LEVEL[fallback_level]
             # Keep PvE matches playable when model artifacts are missing in runtime.
             logger.warning(
-                "Inference unavailable on /gameplay/move; falling back to heuristic_hard",
-                extra={"detail": exc.detail},
+                "Inference unavailable on /gameplay/move; falling back to heuristic",
+                extra={"detail": exc.detail, "fallback_level": fallback_level},
             )
             rng = np.random.default_rng()
-            fallback_move = heuristic_move(board=board, rng=rng, level="hard")
+            fallback_move = heuristic_move(board=board, rng=rng, level=fallback_level)
             if fallback_move is None:
                 return MoveResponse(
                     move=None,
                     action_idx=ACTION_SPACE.pass_index,
                     value=0.0,
-                    mode="heuristic_hard",
+                    mode=fallback_mode,
                 )
             r1, c1, r2, c2 = fallback_move
             return MoveResponse(
                 move=MovePayload(r1=r1, c1=c1, r2=r2, c2=c2),
                 action_idx=ACTION_SPACE.encode(fallback_move),
                 value=0.0,
-                mode="heuristic_hard",
+                mode=fallback_mode,
             )
 
     rng = np.random.default_rng()
diff --git a/src/api/modules/matches/router.py b/src/api/modules/matches/router.py
@@ -35,6 +35,7 @@
 MATCHES_SERVICE_DEP = Depends(get_matches_service_dep)
 CURRENT_USER_DEP = Depends(get_current_user_dep)
 AUTH_SERVICE_DEP = Depends(get_auth_service_dep)
+INVITATIONS_WS_REFRESH_S = 8.0
 
 
 @router.post(
@@ -233,8 +234,11 @@ async def invitations_ws(
                 }
             )
             try:
-                # Lower polling pressure on DB while still keeping invitation UI responsive.
-                await asyncio.wait_for(websocket.receive_text(), timeout=2.5)
+                # Lower DB pressure: a multi-second invitation refresh cadence is sufficient.
+                await asyncio.wait_for(
+                    websocket.receive_text(),
+                    timeout=INVITATIONS_WS_REFRESH_S,
+                )
             except (TimeoutError, asyncio.TimeoutError):
                 continue
     except (WebSocketDisconnect, asyncio.TimeoutError):
diff --git a/src/inference/service.py b/src/inference/service.py
@@ -14,6 +14,8 @@
 from game.types import Move
 
 if TYPE_CHECKING:
+    import torch.nn as nn
+
     from engine.mcts import MCTS
 
 InferenceMode = Literal["fast", "strong"]
@@ -57,7 +59,9 @@ def run(self, output_names: list[str] | None, input_feed: dict[str, Any]) -> lis
 
 
 class _SystemLike(Protocol):
-    model: Any
+    @property
+    def model(self) -> nn.Module:
+        ...
 
     def eval(self) -> _SystemLike:
         ...
@@ -69,6 +73,28 @@ def load_state_dict(self, state_dict: dict[str, object]) -> object:
         ...
 
 
+class _CheckpointSystemAdapter:
+    """Minimal runtime wrapper to use plain torch modules as inference systems."""
+
+    def __init__(self, model: nn.Module) -> None:
+        self._model = model
+
+    @property
+    def model(self) -> nn.Module:
+        return self._model
+
+    def eval(self) -> _CheckpointSystemAdapter:
+        self._model.eval()
+        return self
+
+    def to(self, device: str) -> _CheckpointSystemAdapter:
+        self._model.to(device)
+        return self
+
+    def load_state_dict(self, state_dict: dict[str, object]) -> object:
+        return self._model.load_state_dict(state_dict)
+
+
 @lru_cache(maxsize=1)
 def _get_torch_module() -> ModuleType | None:
     """Import torch lazily so API startup does not hard-fail in lightweight runtimes."""
@@ -165,17 +191,31 @@ def _extract_arch_kwargs(raw_kwargs: ModelInitKwargs) -> dict[str, Any]:
         allowed = ("d_model", "nhead", "num_layers", "dim_feedforward", "dropout")
         return {key: raw_kwargs[key] for key in allowed if key in raw_kwargs}
 
+    @staticmethod
+    def _extract_model_state_dict(state_dict: dict[str, Any]) -> dict[str, Any]:
+        # Training checkpoints prefix model params with `model.` (Lightning module layout).
+        # Runtime inference uses the raw network: strip the prefix only when all keys have it.
+        if all(key.startswith("model.") for key in state_dict):
+            return {key.removeprefix("model."): value for key, value in state_dict.items()}
+        return state_dict
+
     def _build_legacy_system(self) -> _SystemLike:
         from inference.legacy_model import LegacyAtaxxSystem
 
         return LegacyAtaxxSystem(**self._extract_arch_kwargs(self.model_kwargs))
 
-    def _load_system(self) -> _SystemLike:
-        from model.system import AtaxxZero
+    def _build_spatial_system(self) -> _SystemLike:
+        from model.transformer import AtaxxTransformerNet
+
+        model = AtaxxTransformerNet(**self._extract_arch_kwargs(self.model_kwargs))
+        return _CheckpointSystemAdapter(model)
 
+    def _load_system(self) -> _SystemLike:
         torch_module = self._require_torch()
         ckpt = self.checkpoint_path
         if ckpt.suffix == ".ckpt":
+            from model.system import AtaxxZero
+
             try:
                 return AtaxxZero.load_from_checkpoint(str(ckpt), map_location=self.device)
             except RuntimeError as exc:
@@ -191,9 +231,9 @@ def _load_system(self) -> _SystemLike:
         if not isinstance(state_dict_obj, dict):
             raise ValueError("Checkpoint dictionary must contain key 'state_dict'.")
 
-        system = AtaxxZero(**self.model_kwargs)
+        system = self._build_spatial_system()
         try:
-            system.load_state_dict(state_dict_obj)
+            system.load_state_dict(self._extract_model_state_dict(state_dict_obj))
         except RuntimeError as exc:
             if self._is_legacy_state_dict(state_dict_obj):
                 legacy_system = self._build_legacy_system()
diff --git a/tests/test_api_move.py b/tests/test_api_move.py
@@ -10,6 +10,7 @@
 sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))
 
 from api.app import create_app
+from api.config import Settings
 from api.deps.inference import get_inference_service_dep
 from api.modules.gameplay.schemas import MoveRequest
 from game.actions import ACTION_SPACE
@@ -119,6 +120,27 @@ def _unavailable_inference() -> _StubInferenceService:
         response = client.post("/api/v1/gameplay/move", json=payload)
         self.assertEqual(response.status_code, 200)
 
+        body = response.json()
+        self.assertEqual(body["mode"], "heuristic_easy")
+        self.assertIsInstance(body["action_idx"], int)
+
+    def test_move_endpoint_fallback_level_honors_settings(self) -> None:
+        app = create_app(settings=Settings(inference_fallback_heuristic_level="hard"))
+
+        def _unavailable_inference() -> _StubInferenceService:
+            raise HTTPException(
+                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                detail="Inference unavailable in test",
+            )
+
+        app.dependency_overrides[get_inference_service_dep] = _unavailable_inference
+        client = TestClient(app)
+
+        board = AtaxxBoard()
+        payload = MoveRequest(board=board_to_state(board), mode="fast").model_dump()
+        response = client.post("/api/v1/gameplay/move", json=payload)
+        self.assertEqual(response.status_code, 200)
+
         body = response.json()
         self.assertEqual(body["mode"], "heuristic_hard")
         self.assertIsInstance(body["action_idx"], int)
diff --git a/tests/test_inference_service.py b/tests/test_inference_service.py
@@ -17,6 +17,7 @@
 from inference.legacy_model import LegacyAtaxxSystem
 from inference.service import InferenceService
 from model.system import AtaxxZero
+from model.transformer import AtaxxTransformerNet
 
 
 class TestInferenceService(unittest.TestCase):
@@ -58,6 +59,49 @@ def test_fast_mode_returns_legal_action(self) -> None:
             self.assertIn(result.move, legal_moves)
             self.assertTrue(-1.0 <= result.value <= 1.0)
 
+    def test_pt_checkpoint_load_does_not_require_lightning_runtime(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = AtaxxTransformerNet(
+                d_model=64,
+                nhead=8,
+                num_layers=2,
+                dim_feedforward=128,
+                dropout=0.0,
+            )
+            ckpt_path = Path(tmp_dir) / "spatial.pt"
+            state_dict = {f"model.{key}": value for key, value in model.state_dict().items()}
+            torch.save({"state_dict": state_dict}, ckpt_path)
+
+            native_import = __import__
+
+            def guarded_import(
+                name: str,
+                globals_: dict[str, object] | None = None,
+                locals_: dict[str, object] | None = None,
+                fromlist: tuple[str, ...] = (),
+                level: int = 0,
+            ) -> object:
+                if name.startswith("pytorch_lightning"):
+                    raise ModuleNotFoundError("pytorch_lightning blocked by test")
+                return native_import(name, globals_, locals_, fromlist, level)
+
+            with patch("builtins.__import__", side_effect=guarded_import):
+                service = InferenceService(
+                    checkpoint_path=ckpt_path,
+                    device="cpu",
+                    model_kwargs={
+                        "d_model": 64,
+                        "nhead": 8,
+                        "num_layers": 2,
+                        "dim_feedforward": 128,
+                        "dropout": 0.0,
+                    },
+                )
+                result = service.predict(AtaxxBoard(), mode="fast")
+
+            self.assertEqual(result.mode, "fast")
+            self.assertIsNotNone(result.move)
+
     def test_strong_mode_returns_legal_action(self) -> None:
         with tempfile.TemporaryDirectory() as tmp_dir:
             system = self._tiny_system()
diff --git a/web/src/pages/match/MatchPage.tsx b/web/src/pages/match/MatchPage.tsx
@@ -62,7 +62,7 @@ const AI_THINK_DELAY_MS = 460;
 const AI_PREVIEW_MS = 420;
 const INFECTION_STEP_MS = 90;
 const INFECTION_BURST_MS = 420;
-const OUTGOING_INVITE_POLL_MS = 2500;
+const OUTGOING_INVITE_POLL_MS = 4000;
 const UI_TICK_MS = 120;
 const INTRO_COUNTDOWN_START = 3;
 const HOVER_SFX_MIN_GAP_MS = 120;
@@ -383,6 +383,7 @@ export function MatchPage(): JSX.Element {
   const gameplayWsRef = useRef<WebSocket | null>(null);
   const lastWsPlyRef = useRef(-1);
   const persistQueueRef = useRef<Promise<void>>(Promise.resolve());
+  const latestBoardRef = useRef<BoardState>(board);
   const failedPersistOpsRef = useRef<PendingPersistOperation[]>([]);
   const unmountCleanupTriggeredRef = useRef(false);
   const unmountCleanupStateRef = useRef<{
@@ -416,6 +417,10 @@ export function MatchPage(): JSX.Element {
     }
   }, [accessToken]);
 
+  useEffect(() => {
+    latestBoardRef.current = board;
+  }, [board]);
+
   useEffect(() => {
     unmountCleanupStateRef.current = {
       accessToken,
@@ -1372,7 +1377,10 @@ export function MatchPage(): JSX.Element {
 
       if (event.move.board_after !== null) {
         const boardAfter = event.move.board_after as BoardState;
-        setBoard(boardAfter);
+        // Ignore stale snapshots: delayed WS frames used to overwrite a newer local board.
+        if (boardAfter.half_moves >= latestBoardRef.current.half_moves) {
+          setBoard(boardAfter);
+        }
       }
       const remoteMove =
         event.move.r1 === null || event.move.c1 === null || event.move.r2 === null || event.move.c2 === null
@@ -1798,14 +1806,15 @@ export function MatchPage(): JSX.Element {
 
   const persistMoveWithRetry = useCallback(
     async (operation: PendingPersistOperation) => {
-      if (!canPersist || accessToken === null) {
+      const token = lastAccessTokenRef.current;
+      if (!canPersist || token === null) {
         throw new Error("Persistencia no disponible.");
       }
       let lastError: unknown = null;
       for (let attempt = 1; attempt <= PERSIST_MAX_RETRIES; attempt += 1) {
         try {
           await storeManualMove(
-            accessToken,
+            token,
             operation.gameId,
             operation.beforeBoard,
             operation.move,
@@ -1826,7 +1835,7 @@ export function MatchPage(): JSX.Element {
       }
       throw lastError;
     },
-    [accessToken, canPersist],
+    [canPersist],
   );
 
   const disableRemotePersistence = useCallback(
diff --git a/web/src/widgets/layout/AppShell.tsx b/web/src/widgets/layout/AppShell.tsx