Merge pull request #4 from Tesla2000/feature/python-speedup

Tesla2000 · web-flow · commit 2d8e113d2bdb · 2024-04-14T15:38:19.000+02:00
Feature/python speedup
diff --git a/Config.py b/Config.py
@@ -27,7 +27,8 @@ class _ConfigAgent:
     c = 0.2
     learning_rate = 1e-5
     debug = False
-    pretrain = True
+    # pretrain = True
+    pretrain = False
 
 
 class Config(_ConfigPaths, _ConfigAgent):
@@ -38,7 +39,7 @@ class Config(_ConfigPaths, _ConfigAgent):
     train_batch_size = 128
     training_buffer_len = 100_000
     min_n_points_to_finish = 15
-    n_simulations = 100
+    n_simulations = 1000
     n_games = None
     n_players = 2
     n_actions = 45
diff --git a/agent/Agent.py b/agent/Agent.py
@@ -8,7 +8,7 @@
 
 class Agent(nn.Module):
     _input_size_dictionary = {
-        2: 205,
+        2: 211,
     }
 
     def __init__(
diff --git a/agent/policy.py b/agent/policy.py
@@ -18,7 +18,7 @@ def policy(
     P = defaultdict(dict)
     Q = defaultdict(dict)
     initial_state = game.get_state()
-    all_moves = game.get_possible_actions()
+    all_moves = game.all_moves
     for _ in range(n_simulations):
         search(game.copy(), agent, c, N, visited, P, Q)
     pi = np.array([N[initial_state][a] for a in all_moves])
diff --git a/agent/search.py b/agent/search.py
@@ -1,6 +1,7 @@
-from collections import defaultdict
 from math import sqrt
+from collections import defaultdict
 
+import torch
 from torch import nn, Tensor
 
 from src.Game import Game
@@ -20,19 +21,35 @@ def search(
     state = game.get_state()
     if state not in visited:
         visited.add(state)
-        move_scores, v = agent(Tensor([state]))
+        with torch.no_grad():
+            move_scores, v = agent(Tensor([state]))
         tuple(
-            P[state].__setitem__(move, move_scores[0, index])
+            P[state].__setitem__(move, move_scores[0, index].item())
             for index, move in enumerate(game.all_moves)
         )
-        return -v
+        return -v.item()
+    q_state = Q[state]
+    p_state = P[state]
+    n_state = N[state]
+    sqrt_value = sqrt(sum(n_state.values()))
 
+    # def _get_action(game: Game):
+    #     return max(
+    #         game.get_possible_actions(),
+    #         key=lambda action: q_state.get(action, 1) + c * p_state[action] * sqrt_value / (1 + n_state[action]),
+    #     )
+    # def _get_action(game: Game):
+    #     best_action = None
+    #     best_value = -float('inf')
+    #     for action in game.all_moves:
+    #         value = q_state.get(action, 1) + c * p_state[action] * sqrt_value / (1 + n_state[action])
+    #         if value > best_value and action.is_valid(game):
+    #             best_value, best_action = value, action
+    #     return best_action
     action = max(
         game.get_possible_actions(),
-        key=lambda action: Q[state].get(action, 1)
-        + c * P[state][action] * sqrt(sum(N[state].values())) / (1 + N[state][action]),
+        key=lambda action: q_state.get(action, 1) + c * p_state[action] * sqrt_value / (1 + n_state[action]),
     )
-
     next_game_state = game.perform(action)
     v = search(next_game_state, agent, c, N, visited, P, Q)
 
diff --git a/agent/self_play.py b/agent/self_play.py
@@ -32,7 +32,7 @@ def self_play(
 def _perform_game(
     game: Game, states: list, id_to_agent: dict[int, Agent]
 ) -> tuple[list[tuple[np.array, np.array, int]], Agent]:
-    for turn in tqdm(count()):
+    for _ in tqdm(count()):
         agent = id_to_agent[game.current_player.id]
         pi, action = policy(game, agent, Config.c, Config.n_simulations)
         states.append((game, pi / pi.sum(), 0))
@@ -47,7 +47,6 @@ def _perform_game(
                         int(result[state[0].current_player.id] == 1),
                     )
                     for state in states
-                    if state[1] != game.null_move
                 ),
                 id_to_agent[
                     next(player.id for player in game.players if result[player.id])
diff --git a/src/Game.py b/src/Game.py
@@ -155,6 +155,11 @@ def get_possible_actions(self) -> tuple[Move, ...]:
             (self.null_move,) if self.null_move.is_valid(self) else tuple()
         )
 
+    def get_possible_action_indexes(self) -> tuple[int, ...]:
+        return tuple(index for index, move in enumerate(self.all_moves) if move.is_valid(self)) or (
+            (self.null_move,) if self.null_move.is_valid(self) else tuple()
+        )
+
     combos = combinations([{field.name: 1} for field in fields(BasicResources)], 3)
     all_moves = list(
         GrabThreeResource(BasicResources(**res_1, **res_2, **res_3))
diff --git a/src/StateExtractor.py b/src/StateExtractor.py
@@ -48,6 +48,7 @@ def get_state(cls, game: "Game") -> tuple:
                         tuple(iter(aristocrat.cost))
                         for aristocrat in game.board.aristocrats
                     ),
+                    iter(game.board.resources),
                     chain.from_iterable(
                         (
                             *tuple(iter(player.resources)),

agent/Agent.py
Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`
`9`	`9`	`class Agent(nn.Module):`
`10`	`10`	`_input_size_dictionary = {`
`11`		`- 2: 205,`
	`11`	`+ 2: 211,`
`12`	`12`	`}`
`13`	`13`
`14`	`14`	`def __init__(`
src/StateExtractor.py
Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,7 @@ def get_state(cls, game: "Game") -> tuple:`
`48`	`48`	`tuple(iter(aristocrat.cost))`
`49`	`49`	`for aristocrat in game.board.aristocrats`
`50`	`50`	`),`
	`51`	`+ iter(game.board.resources),`
`51`	`52`	`chain.from_iterable(`
`52`	`53`	`(`
`53`	`54`	`*tuple(iter(player.resources)),`