From dd7147ed2929a77cdccf936e4052f1cc4ea96079 Mon Sep 17 00:00:00 2001 From: Thomas Miconi Date: Mon, 2 Mar 2026 15:01:23 -0800 Subject: [PATCH 1/2] Grixel (pixel-based Grid) environment Pixel-based version of the ocean/grid environment. Gridworld with pixel inputs. Simple visual-memory experiment (for each episode, the agent must find out, from experience, which of two moving objects is the reward vs. the enemy). See ocean/grixel/README.md for more information. --- pufferlib/config/ocean/grixel.ini | 116 +++ pufferlib/ocean/environment.py | 1 + pufferlib/ocean/grixel/README.md | 36 + pufferlib/ocean/grixel/__init__.py | 0 pufferlib/ocean/grixel/binding.c | 106 +++ pufferlib/ocean/grixel/grixel.c | 95 +++ pufferlib/ocean/grixel/grixel.h | 1218 ++++++++++++++++++++++++++++ pufferlib/ocean/grixel/grixel.py | 101 +++ pufferlib/ocean/torch.py | 464 +++++++++-- 9 files changed, 2060 insertions(+), 77 deletions(-) create mode 100644 pufferlib/config/ocean/grixel.ini create mode 100644 pufferlib/ocean/grixel/README.md create mode 100644 pufferlib/ocean/grixel/__init__.py create mode 100644 pufferlib/ocean/grixel/binding.c create mode 100644 pufferlib/ocean/grixel/grixel.c create mode 100644 pufferlib/ocean/grixel/grixel.h create mode 100644 pufferlib/ocean/grixel/grixel.py diff --git a/pufferlib/config/ocean/grixel.ini b/pufferlib/config/ocean/grixel.ini new file mode 100644 index 0000000000..d1c2604108 --- /dev/null +++ b/pufferlib/config/ocean/grixel.ini @@ -0,0 +1,116 @@ +# REMEMBER DEFAULT.INI + +[base] +package = ocean +env_name = puffer_grixel +#policy_name = Policy +policy_name = Grixel +rnn_name = Recurrent +# Not included in this release: +#rnn_name = RecurrentPlastic +#rnn_name = RecurrentTransformer +test = Test +test2 = Test2 + +[policy] +hidden_size = 512 +#3031 +#512 + +[rnn] +input_size = 512 +#3031 +#512 +hidden_size = 512 +#n_layers = 1 + +[vec] +#num_envs = 8 +num_envs = 1 + +[env] +#max_size = 47 +#max_size = 25 +#max_size = 23 + +#max_size = 13 
+#num_envs = 1024 +#num_maps = 8192 + +max_size = 13 +#num_envs = 4096 +num_envs = 1024 +num_maps = 8192 + +# texture_mode governs texture assignment +# to goal/reward vs. zombies +# 0: fixed textures with fixed assignments +# 1: two fixed textures, which one is reward/ +# zombie changes randomly for each episode +# 2: completely random textures each time +# Note: the neutral object always has random +# texture + +texture_mode = 2 + +[train] +# Best params +#total_timesteps = 435_000_000 +#adam_beta1 = 0.9801350114303844 +#adam_beta2 = 0.9931056135397744 +#adam_eps = 6.024885743259763e-8 +#clip_coef = 0.283658795325587 +#ent_coef = 0.007885530106105381 +#gae_lambda = 0.9574676436577135 +#gamma = 0.9961782334639131 +#learning_rate = 0.0007890771333884192 +#max_grad_norm = 2.5271346931510053 +#minibatch_size = 8192 +#prio_alpha = 0.8735470630752789 +#prio_beta0 = 0.6533958384978629 +#vf_clip_coef = 1.9338563232919095 +#vf_coef = 3.915248046963283 +#vtrace_c_clip = 1.018588814067991 +#vtrace_rho_clip = 2.4215244529216466 + +# # New sweep best params +#total_timesteps = 435_000_000 +total_timesteps = 1_000_000_000 +# adam_beta1 = 0.9493079570168755 +# adam_beta2 = 0.9998213228757207 +# adam_eps = 2.16720639574209e-8 +# + +# Should be equal to env->horizon in grixel.h - set_state() +bptt_horizon = 128 +#bptt_horizon = 256 + +# clip_coef = 0.399530686596841 +ent_coef = 0.0017271288609381147 +#ent_coef = 0.01 +#ent_coef = 0.003 +# gae_lambda = 0.9491722822649111 +# gamma = 0.9877360824574745 +# max_grad_norm = 3.016348031602564 +# minibatch_size = 8192 +# prio_alpha = 0.8219794821639037 +# prio_beta0 = 0.9447478232810274 +# vf_clip_coef = 0.6051579400844748 +# vf_coef = 2.323141961227481 +# vtrace_c_clip = 1.2499497264614237 +# vtrace_rho_clip = 4.7398234531013985 +# +# #learning_rate = 0.0012892859713461897 +# # anneal_fr is defined as True in default.ini +learning_rate = 0.001 +anneal_lr = False + +[sweep] +downsample = 0 + +[sweep.train.total_timesteps] +distribution = 
log_normal +min = 3e8 +max = 6e8 +mean = 3e8 +scale = time diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 6c56a4ea20..db943e41f7 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -148,6 +148,7 @@ def make_multiagent(buf=None, **kwargs): 'trash_pickup': 'TrashPickupEnv', 'tower_climb': 'TowerClimb', 'grid': 'Grid', + 'grixel': 'Grixel', 'shared_pool': 'PyCPR', 'impulse_wars': 'ImpulseWars', 'drive': 'Drive', diff --git a/pufferlib/ocean/grixel/README.md b/pufferlib/ocean/grixel/README.md new file mode 100644 index 0000000000..d2baa87772 --- /dev/null +++ b/pufferlib/ocean/grixel/README.md @@ -0,0 +1,36 @@ +## Grixel environment + +This is a pixel-based version of the "grid" environment, that is, a gridworld with pixel-based inputs (as in Crafter / Craftax). We use the pixel-based inputs, and the large space of visual stimuli they allow, to implement a very simple meta-learning experiment, based on visual memory. + +Each world is a maze (with added gaps at random positions to make movement easier, since maze-solving is not the primary purpose of the environment). In addition to the agent, there are two types of moving objects (or "mobs"), namely "rewards" and "zombies". When hitting a mob, the agent receives a reward (positive or negative) and is randomly teleported. Currently all mobs move randomly. + +There is also a "neutral" type of object, which can be picked and dropped by the agent (picking is simply by moving onto it, dropping is a dedicated action). Currently this has no effect at all. + +The visual input to the agent is a local portion of the pixel map, of size 11 x 11 x block_size x block_size. 11x11 is inherited from the "grid" environment as the visual input diameter over the gridworld, and block_size (default 5) is the number of pixels in the height/width of each block in the grid. + +All objects are represented by binary textures of size block_size x block_size. 
The exact visual appearance of all objects is governed by the "texture_mode" parameter in the "env" section of the configuration: + +- texture_mode=0: the reward and the zombie each have a fixed, unchanging appearance across episodes +- texture_mode=1: the reward and the zombie randomly swap their appearance for each episode (there are still only two possible appearances in total) +- texture_mode=2: the reward and the zombie have completely random appearance, that is, each of them is assigned a random binary texture for each episode. + +In modes 1 and 2, the agent must learn anew which of the two mobs is the reward or the zombie, from experience. This is the meta-learning aspect of the experiment. + +Crucially, the agent can also perceive previous-step reward as part of its input; this is required for meta-learning. + +The encoder is a CNN where the input layer has both kernel size and stride equal to block_size: the first convolution thus separately maps each block of the gridworld into a single vector. + +The experiment works with the standard LSTM from PufferLib's Recurrent model. We also implemented a transformer and a plastic LSTM, with the plastic LSTM performing best by far in this simple visual memory task. These are not included here as they require modifying the rest of the PufferLib code (though you can see these *highly experimental* implementations [there](https://github.com/ThomasMiconi/PufferLib/blob/grixel/pufferlib/models.py)). + +Notably, all episodes have the same length, equal to the backpropagation-through-time horizon of the PPO training loop. This avoids difficulties with changing environments and ensures each episode starts with a reset hidden state during training. + +This code is provided as is. Everything in this code is experimental and none of it has been thoroughly tested. 
+ +To run the training: + +`puffer train puffer_grixel --rnn-name Recurrent --env.texture-mode 2` + +To start a visual eval: + +`puffer eval puffer_grixel --rnn-name Recurrent --load-model-path [checkpoint_file] --env.texture-mode 2` + diff --git a/pufferlib/ocean/grixel/__init__.py b/pufferlib/ocean/grixel/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pufferlib/ocean/grixel/binding.c b/pufferlib/ocean/grixel/binding.c new file mode 100644 index 0000000000..b6908a17e6 --- /dev/null +++ b/pufferlib/ocean/grixel/binding.c @@ -0,0 +1,106 @@ +#include "grixel.h" + +#define Env Grixel +#define MY_SHARED +#include "../env_binding.h" + +static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) { + + // This is called once at the start of each + // experiment, generating a fixed set of maps, + // from which we will choose randomly for each + // episode + + int num_maps = unpack(kwargs, "num_maps"); + int max_size = unpack(kwargs, "max_size"); + int size = unpack(kwargs, "size"); + + // These are needed because we're using init_grid, which relies on + // them being set + int pixelize = unpack(kwargs, "pixelize"); + int block_size = unpack(kwargs, "block_size"); + + State* levels = calloc(num_maps, sizeof(State)); + + if (max_size <= 5) { + PyErr_SetString(PyExc_ValueError, "max_size must be >5"); + return NULL; + } + + // Temporary env used to gen maps + Grixel env; + env.max_size = max_size; + env.pixelize = pixelize; + env.block_size= block_size; + env.additional_obs_size = unpack(kwargs, "additional_obs_size"); + env.nb_object_types = unpack(kwargs, "nb_object_types"); + + // Hmmm... at that point, block_size and pixelize are not defined, + // even though they're used in init_grid! 
+ init_grid(&env); // This allocates env, with 1 agent and 1 max-size grid + + srand(time(NULL)); + int start_seed = rand(); + for (int i = 0; i < num_maps; i++) { + int sz = size; + if (size == -1) { + int min = 9; + //if (max_size / 2 > min) + // min = max_size/2; + sz = min + (rand() % (max_size-min)); + //sz = 5 + (rand() % (max_size-5)); + //sz = max_size / 2; + } + + if (sz % 2 == 0) { + sz -= 1; + } + + float difficulty = (float)rand()/(float)(RAND_MAX); + create_maze_level(&env, sz, sz, difficulty, start_seed + i); + init_state(&levels[i], max_size, 1); // allocates the grid, with num_agents=1 + get_state(&env, &levels[i]); // this copies from env to levels + // if env and levels have different num_agents strange things might happen? + } + + return PyLong_FromVoidPtr(levels); +} + +static int my_init(Env* env, PyObject* args, PyObject* kwargs) { + env->max_size = unpack(kwargs, "max_size"); + env->num_maps = unpack(kwargs, "num_maps"); + env->texture_mode = unpack(kwargs, "texture_mode"); + env->pixelize = unpack(kwargs, "pixelize"); + env->additional_obs_size = unpack(kwargs, "additional_obs_size"); + env->nb_object_types = unpack(kwargs, "nb_object_types"); + env->block_size= unpack(kwargs, "block_size"); + + init_grid(env); //requires block_size to be pre-set + + env->is_mobile[REWARD] = 1; + env->is_mobile[ZOMBIE] = 1; + env->is_pickable[OBJECT] = 1; + + PyObject* handle_obj = PyDict_GetItemString(kwargs, "state"); + if (!PyObject_TypeCheck(handle_obj, &PyLong_Type)) { + PyErr_SetString(PyExc_TypeError, "state handle must be an integer"); + return 1; + } + + State* levels = (State*)PyLong_AsVoidPtr(handle_obj); + if (!levels) { + PyErr_SetString(PyExc_ValueError, "Invalid state handle"); + return 1; + } + + env->levels = levels; + return 0; +} + +static int my_log(PyObject* dict, Log* log) { + assign_to_dict(dict, "perf", log->perf); + assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_return", log->episode_return); + 
assign_to_dict(dict, "episode_length", log->episode_length); + return 0; +} diff --git a/pufferlib/ocean/grixel/grixel.c b/pufferlib/ocean/grixel/grixel.c new file mode 100644 index 0000000000..5c982d2f76 --- /dev/null +++ b/pufferlib/ocean/grixel/grixel.c @@ -0,0 +1,95 @@ +#include "grixel.h" + +int main() { + int max_size = 16; //32; + int width = 16; //32; + int height = 16; //32; + int num_agents = 1; + int horizon = 128; + float speed = 1; + int vision = 5; + bool discretize = true; + + int render_cell_size = 16; + int seed = 0; + + Grixel* env = allocate_grid(max_size, num_agents, horizon, + vision, speed, discretize); + + //env->width = 32; + //env->height = 32; env->agents[0].spawn_x = 16; + //env->agents[0].spawn_y = 16; + //env->agents[0].color = 6; + //reset(env, seed); + //load_locked_room_preset(env); + + + State* levels = calloc(1, sizeof(State)); + + //create_maze_level(env, 31, 31, 0.85, seed); + //create_maze_level(env, 15, 15, 0.85, seed); + create_maze_level(env, 15, 15, 0.25, seed); + init_state(levels, max_size, num_agents); + get_state(env, levels); + env->num_maps = 1; + env->levels = levels; + //generate_locked_room(env); + //State state; + //init_state(&state, env->max_size, env->num_agents); + //get_state(env, &state); + + /* + width = height = 31; + env->width=31; + env->height=31; + env->agents[0].spawn_x = 1; + env->agents[0].spawn_y = 1; + reset(env, seed); + generate_growing_tree_maze(env->grid, env->width, env->height, max_size, 0.85, 0); + env->grid[(env->height-2)*env->max_size + (env->width - 2)] = GOAL; + */ + + int tick = 0; + c_render(env); + while (!WindowShouldClose()) { + // User can take control of the first agent + env->actions[0] = ATN_FORWARD; + Agent* agent = &env->agents[0]; + + // TODO: Why are up and down flipped? 
+ if (IsKeyDown(KEY_LEFT_SHIFT)) { + if (IsKeyDown(KEY_UP) || IsKeyDown(KEY_W)){ + //env->actions[0] = ATN_FORWARD; + agent->direction = 3.0*PI/2.0; + } else if (IsKeyDown(KEY_DOWN) || IsKeyDown(KEY_S)) { + //env->actions[0] = ATN_BACK; + agent->direction = PI/2.0; + } else if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) { + //env->actions[0] = ATN_LEFT; + agent->direction = PI; + } else if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) { + //env->actions[0] = ATN_RIGHT; + agent->direction = 0; + } else { + env->actions[0] = ATN_PASS; + } + } else { + for (int i = 0; i < num_agents; i++) { + env->actions[i] = rand() % 5; + } + } + + //env->actions[0] = actions[t]; + tick = (tick + 1)%12; + bool done = false; + if (tick % 1 == 0) { + c_step(env); + //printf("direction: %f\n", env->agents[0].direction); + + } + c_render(env); + } + free_allocated_grid(env); + return 0; +} + diff --git a/pufferlib/ocean/grixel/grixel.h b/pufferlib/ocean/grixel/grixel.h new file mode 100644 index 0000000000..584c0fca73 --- /dev/null +++ b/pufferlib/ocean/grixel/grixel.h @@ -0,0 +1,1218 @@ +#include +#include +#include +#include +#include +#include +#include +#include "raylib.h" + +#define TWO_PI 2.0*PI +#define MAX_SIZE 40 + +#define ATN_PASS 0 +#define ATN_FORWARD 1 +#define ATN_LEFT 2 +#define ATN_RIGHT 3 +#define ATN_BACK 4 +#define ATN_DROP 5 + +#define DIR_WEST 0.0; +#define DIR_NORTH PI/2.0; +#define DIR_EAST PI; +#define DIR_SOUTH 3.0*PI/2.0; + +#define MAX_MOBS 5 + +#define EMPTY 0 +#define WALL 1 +#define ZOMBIE 2 +#define GOAL 3 +#define REWARD 4 +#define OBJECT 5 +#define AGENT 6 +#define BLOCK 14 + +// Maximum item number is 32 (from pufferlib/ocean/torch.py , one_hot in encode_observations) + +#define LOG_BUFFER_SIZE 4096 + +typedef struct Log Log; +struct Log { + float perf; + float score; + float episode_return; + float episode_length; + float n; +}; + +// 8 unique agents (we only use 1) +bool is_agent(int idx) { + //return idx >= AGENT && idx < AGENT + 8; + return idx == AGENT 
; +} + + +/*int rand_color() { + return AGENT + rand()%8; +}*/ + +// 6 unique keys and doors +/*bool is_key(int idx) { + return idx >= KEY && idx < KEY + 6; +} +bool is_locked_door(int idx) { + return idx >= DOOR_LOCKED && idx < DOOR_LOCKED + 6; +} +bool is_open_door(int idx) { + return idx >= DOOR_OPEN && idx <= DOOR_OPEN + 6; +} +bool is_correct_key(int key, int door) { + return key == door - 6; +}*/ + + + +// Every object or NPC (except for wall and +// empty space) is a "Mob". Even those that +// don't move. +typedef struct Mob Mob; +struct Mob{ + float y; + float x; + float prev_y; + float prev_x; + float spawn_y; + float spawn_x; + int color; // determines the type of the mob + int alive; + int mobile; +}; + +typedef struct Agent Agent; +struct Agent { + float y; + float x; + float prev_y; + float prev_x; + float spawn_y; + float spawn_x; + int color; + float direction; + Mob *held; +}; + + +typedef struct Renderer Renderer; +typedef struct State State; + +typedef struct Grixel Grixel; +struct Grixel{ + Renderer* renderer; + State* levels; + int num_maps; + int width; + int height; + int num_agents; + int horizon; + int vision; + int tick; + float speed; + int obs_diameter; + int max_size; + + // the following values are set in the grixel.py init, passed + // as params to vec_init in grixel.py init, and + // read into these fields by my_init in binding.c: + int block_size; + int pixelize; + int texture_mode; + int additional_obs_size; + int nb_object_types; + + bool discretize; + Log log; + Agent* agents; + Mob* mobs; + char choice; + unsigned char* grid; + int* counts; + // block_Textures and observations used to be + // unsigned char, but now they are char because + // they can have negative values + char* block_textures; + // The following arrays are created by the python class (as + // Gym boxes / numpy arrays), and + // bound to the C struct's same-name variables by env_binding.h + // (better hope that they all agree on the same size!) 
+ char* observations; + float* actions; + float* rewards; + unsigned char* terminals; + char* pockets; // if we ever decide to allow agents to hold more than one object at a time... + char* is_mobile; + char* is_pickable; + // For logging, debugging + float* cum_rewards; + float* prev_rewards; +}; + + +// init_grid allocates and initializes the memory required by the environment +// (including a max size grid) +void init_grid(Grixel* env) { + env->num_agents = 1; // should always be 1 + env->vision = 5; // assumed to be 5 in other code + env->speed = 1; + env->discretize = true; + // env->block_size and env->pixelize should be set before calling init_grid, like max_size and num_maps. + env->obs_diameter = 2*env->vision + 1; // this has to be 11 anyway + /*if (env->block_size != 5){ + printf("block size is not 5, but %d \n", env->block_size); + puts("Error!"); + exit(1); + }*/ + if (env->pixelize) + env->obs_diameter *= env->block_size; + int env_mem= env->max_size * env->max_size; + env->grid = calloc(env_mem, sizeof(unsigned char)); + env->counts = calloc(env_mem, sizeof(int)); + env->agents = calloc(env->num_agents, sizeof(Agent)); + env->mobs = calloc(MAX_MOBS, sizeof(Mob)); + env->cum_rewards = calloc(env->num_agents, sizeof(float)); + env->prev_rewards = calloc(env->num_agents, sizeof(float)); + env->pockets = calloc(env->num_agents, sizeof(char) * env->nb_object_types); + env->is_mobile = calloc(env->nb_object_types, sizeof(char)); + env->is_pickable = calloc(env->nb_object_types, sizeof(char)); + // Allocating and initializing textures + // Originally, 32 possible objects + // env->block_textures = calloc(32* env->block_size * env->block_size, sizeof(char)); + env->block_textures = calloc(env->nb_object_types * env->block_size * env->block_size, sizeof(char)); + fill_textures_randomize(env); + +} + +// Apparently the following is only used in the grixel.c test code +// but note it confirms observations as an unsigned char array... 
+// DO NOT USE, not updated +Grixel* allocate_grid(int max_size, int num_agents, int horizon, + int vision, float speed, bool discretize) { + Grixel* env = (Grixel*)calloc(1, sizeof(Grixel)); + env->max_size = max_size; + env->num_agents = num_agents; + env->horizon = horizon; + env->vision = vision; + env->speed = speed; + env->discretize = discretize; + int obs_diameter = 2*vision + 1; + // Since we're in the grixel.c test code, the python Grixel class + // is not used and we must allocate these arrays ourselves + env->observations = calloc( + num_agents*(obs_diameter*obs_diameter+env->additional_obs_size), sizeof(unsigned char)); + env->actions = calloc(num_agents, sizeof(float)); + env->rewards = calloc(num_agents, sizeof(float)); + env->terminals = calloc(num_agents, sizeof(unsigned char)); + init_grid(env); + return env; +} + +void c_close(Grixel* env) { + free(env->grid); + free(env->counts); + free(env->block_textures); + free(env->agents); + free(env->cum_rewards); + free(env->prev_rewards); + free(env->mobs); + free(env); +} + +void free_allocated_grid(Grixel* env) { + free(env->observations); + free(env->actions); + free(env->rewards); + free(env->terminals); + c_close(env); +} + +bool in_bounds(Grixel* env, int y, int c) { + return (y >= 0 && y <= env->height + && c >= 0 && c <= env->width); +} + +int grid_offset(Grixel* env, int y, int x) { + return y*env->max_size + x; +} + +void add_log(Grixel* env, int idx) { + env->log.perf += env->cum_rewards[idx]; + env->log.score += env->cum_rewards[idx]; + env->log.episode_return += env->cum_rewards[idx]; + env->log.episode_length += env->tick; + env->log.n += 1.0; +} + +void compute_observations(Grixel* env) { + memset(env->observations, 0, (env->obs_diameter*env->obs_diameter+env->additional_obs_size)*env->num_agents); + for (int agent_idx = 0; agent_idx < env->num_agents; agent_idx++) { + Agent* agent = &env->agents[agent_idx]; + float y = agent->y; + float x = agent->x; + int start_r = y - env->vision; + if 
(start_r < 0) { + start_r = 0; + } + + int start_c = x - env->vision; + if (start_c < 0) { + start_c = 0; + } + + int end_r = y + env->vision; + if (end_r >= env->max_size) { + end_r = env->max_size - 1; + } + + int end_c = x + env->vision; + if (end_c >= env->max_size) { + end_c = env->max_size - 1; + } + + int obs_offset = agent_idx*(env->obs_diameter*env->obs_diameter + env->additional_obs_size); + + if (env->pixelize == 0) + for (int r = start_r; r <= end_r; r++) { + for (int c = start_c; c <= end_c; c++) { + int r_idx = r - y + env->vision; + int c_idx = c - x + env->vision; + int obs_adr = obs_offset + r_idx*env->obs_diameter + c_idx; + int adr = grid_offset(env, r, c); + env->observations[obs_adr] = env->grid[adr]; + } + } + else + { + int cpt=0; + int BS2 = env->block_size * env->block_size; + for (int ii=-env->vision; ii <= env->vision; ii++) + for (int jj=-env->vision; jj <= env->vision; jj++) + { + cpt ++; + // source row and column + int r_s = y + ii; int c_s = x + jj; + if (r_s < 0) + continue; + if (r_s >= env->max_size) + continue; + if (c_s < 0) + continue; + if (c_s >= env->max_size) + continue; + int source_adr = grid_offset(env, r_s, c_s); + //for (int pos=0; posblock_size*env->block_size; pos++) + // env->observations[(cpt-1)*BS2*BS2 + pos]= env->block_textures[env->grid[source_adr]*BS2*BS2+pos]; + // Hopefully the following does the exact same thing: + memcpy(env->observations+obs_offset+(cpt-1)*BS2, env->block_textures+env->grid[source_adr]*BS2, BS2); + } + } + + // additional observations + if (env->additional_obs_size> 0) + { + // agent sees the previous step's reward (need to test more if current step + // reward is worse) + env->observations[obs_offset+env->obs_diameter * env->obs_diameter] = (char)(10.0 * env->prev_rewards[agent_idx]); + // Note that PufferLib silently clips rewards to -1,1 anyway + if (((char)(env->rewards[agent_idx]) > 1) || ((char)(env->rewards[agent_idx])<-1)) + { + printf("OOB value %d\n", 
(char)env->rewards[agent_idx]); + puts("Exiting"); + exit(1); + } + env->observations[obs_offset+env->obs_diameter * env->obs_diameter +1] = (char)(env->terminals[agent_idx]); + // for debugging + //env->observations[obs_offset+env->obs_diameter * env->obs_diameter +2] = (char)(env->choice); + memcpy(env->observations+obs_offset+env->obs_diameter * env->obs_diameter +3, + env->pockets+agent_idx, env->nb_object_types); + //env->observations[obs_offset+env->obs_diameter * env->obs_diameter +3] = (char)(env->pockets[agent_idx]); + + + } + } +} + +void make_border(Grixel*env) { + for (int r = 0; r < env->height; r++) { + int adr = grid_offset(env, r, 0); + env->grid[adr] = WALL; + adr = grid_offset(env, r, env->width-1); + env->grid[adr] = WALL; + } + for (int c = 0; c < env->width; c++) { + int adr = grid_offset(env, 0, c); + env->grid[adr] = WALL; + adr = grid_offset(env, env->height-1, c); + env->grid[adr] = WALL; + } +} + +void spawn_agent(Grixel* env, int idx, int x, int y) { + Agent* agent = &env->agents[idx]; + int spawn_y = y; + int spawn_x = x; + assert(in_bounds(env, spawn_y, spawn_x)); + int adr = grid_offset(env, spawn_y, spawn_x); + assert(env->grid[adr] == EMPTY); + agent->spawn_y = spawn_y; + agent->spawn_x = spawn_x; + agent->y = agent->spawn_y; + agent->x = agent->spawn_x; + agent->prev_y = agent->y; + agent->prev_x = agent->x; + agent->color = AGENT; // why was that below? 
+ env->grid[adr] = agent->color; + agent->direction = 0; + agent->held = NULL; +} + +void spawn_mob(Grixel* env, int idx, int x, int y, int color) { + Mob* mob = &env->mobs[idx]; + int spawn_y = y; + int spawn_x = x; + assert(in_bounds(env, spawn_y, spawn_x)); + int adr = grid_offset(env, spawn_y, spawn_x); + assert(env->grid[adr] == EMPTY); + mob->spawn_y = spawn_y; + mob->spawn_x = spawn_x; + mob->y = mob->spawn_y; + mob->x = mob->spawn_x; + mob->prev_y = mob->y; + mob->prev_x = mob->x; + mob->alive = 1; // spawning means you're alive + mob->color = color; + mob->mobile = env->is_mobile[mob->color]; + env->grid[adr] = mob->color; +} + +struct State { + int width; + int height; + int num_agents; + Agent* agents; + Mob* mobs; + unsigned char* grid; +}; + +// init_state allocates the memory for a single map, with its agents +void init_state(State* state, int max_size, int num_agents) { + state->agents = calloc(num_agents, sizeof(Agent)); + state->mobs = calloc(MAX_MOBS, sizeof(Mob)); + state->grid = calloc(max_size*max_size, sizeof(unsigned char)); +} + +void free_state(State* state) { + free(state->agents); + free(state->mobs); + free(state->grid); + free(state); +} + +void get_state(Grixel* env, State* state) { + state->width = env->width; + state->height = env->height; + state->num_agents = env->num_agents; + // this seems to assume that (previous) state->num_agents >= env->num_agents ?? + memcpy(state->agents, env->agents, env->num_agents*sizeof(Agent)); + memcpy(state->mobs, env->mobs, MAX_MOBS *sizeof(Mob)); + memcpy(state->grid, env->grid, env->max_size*env->max_size); +} + +// copies the map and agent data from state to env +void set_state(Grixel* env, State* state) { + env->width = state->width; + env->height = state->height; + // env->horizon should be equal to bptt_horizon ! 
+ env->horizon = 128; //2*env->width*env->height; + env->num_agents = state->num_agents; + memcpy(env->agents, state->agents, env->num_agents*sizeof(Agent)); + memcpy(env->mobs, state->mobs, MAX_MOBS * sizeof(Mob)); + memcpy(env->grid, state->grid, env->max_size*env->max_size); +} + +void c_reset(Grixel* env) { + memset(env->grid, 0, env->max_size*env->max_size); + memset(env->counts, 0, env->max_size*env->max_size*sizeof(int)); + memset(env->cum_rewards, 0, env->num_agents*sizeof(float)); + // memset(env->prev_rewards, 0, env->num_agents*sizeof(float)); // not sure about this + env->tick = 0; + fill_textures_randomize(env); + if (env->renderer != NULL) + update_renderer_textures(env); + int idx = rand() % env->num_maps; + + // This one actually initialize most of + // the data (grid, agent(s), etc.) + // Note: levels include agents and mobs + // (initialized to have one agent and + // zero alive mobs, see + // create_maze_level, which is used do + // create levels in binding.c + // make_shared) + set_state(env, &env->levels[idx]); + + int width = env->width; int height = env->height; + int adr, posx, posy; + int cpt = 0; + int putrewardfirst = rand() % 2; + do{ + posx = width/2 + rand() % (width/2); + //posx = 2 + rand() % (width-2); + posy = height/2 + rand() % (height/2); + adr = grid_offset(env, posy, posx); + cpt++; + if (cpt>10000){ + puts("Infinite loop in positioning reward mob"); + exit(1); + } + + } + while (env->grid[adr] != EMPTY); + spawn_mob(env, 0, posx, posy, putrewardfirst?REWARD:ZOMBIE); + do{ + posx = width/2 + rand() % (width/2); + //posx = 2 + rand() % (width-2); + posy = height/2 + rand() % (height/2); + adr = grid_offset(env, posy, posx); + // the joys of debugging in c (assert doesn't work, gdb sucks) + cpt++; + if (cpt>10000){ + puts("Infinite loop in positioning zombie mob"); + for (int numr=0; numrgrid[grid_offset(env, numr, numc)]); + puts(" "); + } + printf("%d %d %d %d\n", posx, posy, width/2, height/2); + exit(1); + } + } + while 
(env->grid[adr] != EMPTY); + spawn_mob(env, 1, posx, posy, putrewardfirst?ZOMBIE:REWARD); + do{ + posx = width/2 + rand() % (width/2); + posy = height/2 + rand() % (height/2); + //posx = 2 + rand() % (width-5); + //posy = 2 + rand() % (height-5); + adr = grid_offset(env, posy, posx); + cpt++; + if (cpt>10000){ + puts("Infinite loop in positioning zombie mob"); + for (int numr=0; numrgrid[grid_offset(env, numr, numc)]); + puts(" "); + } + printf("%d %d %d %d\n", posx, posy, width/2, height/2); + exit(1); + } + } + while (env->grid[adr] != EMPTY); + spawn_mob(env, 2, posx, posy, OBJECT); + + compute_observations(env); +} + +int move_to(Grixel* env, int agent_idx, float y, float x) { + Agent* agent = &env->agents[agent_idx]; + if (!in_bounds(env, y, x)) { + return 1; + } + + int adr = grid_offset(env, round(y), round(x)); + int dest = env->grid[adr]; + if (dest == WALL) { + return 1; + } else if (dest == REWARD || dest == GOAL || dest == ZOMBIE) { + + // REMINDER: REWARDS ARE CLAMPED TO [-1;1] IN + // PUFFERL.PY ! + + if (dest == ZOMBIE) + env->rewards[agent_idx] = -.7; //-1.0; + else + env->rewards[agent_idx] = 1.0; //1.0; + if (env->renderer != NULL) // i.e. 
if we're in eval-mode, with display + printf("Reward: %.2f\n", env->rewards[agent_idx]); + env->cum_rewards[agent_idx] +=env->rewards[agent_idx]; + // Teleporting agent to spawn position + /*x = agent->spawn_x; + y = agent->spawn_y; + adr = grid_offset(env, round(y), round(x));*/ + + // Teleporting agent to random position: + int cpt=0; + do{ + x = 1 + rand() % (env->width-2); + y = 1 + rand() % (env->height-2); + adr = grid_offset(env, y, x); + cpt++; + if (cpt>10000){ + puts("Infinite loop in teleporting agent"); + for (int numr=0; numrheight; numr++){ + for (int numc=0; numcheight; numc++) + printf("%d ", env->grid[grid_offset(env, numr, numc)]); + puts(" "); + } + printf("%d %d %d %d\n", x, y, env->width/2, env->height/2); + exit(1); + } + } + while (env->grid[adr] != EMPTY); + + + } + else if (env->grid[adr] != EMPTY) + { + if ((!env->is_pickable[dest]) || (agent->held != NULL)) + return 1; + else + { + // We're picking something + int num_mob = 0; Mob * this_mob; + // Unspawn mob + for (num_mob=0; num_mob < MAX_MOBS; num_mob++) + { + this_mob = &env->mobs[num_mob]; + if (!this_mob->alive) + // Should *not* include not-alive mobs ! + continue; + if ((round(this_mob->x) == round(x)) && (round(this_mob->y) == round(y))) + break; + } + if (num_mob == MAX_MOBS) + { + puts("Error! Couldn't find mob to be picked."); + printf("grid[adr]: %d\n", env->grid[adr]); + printf("x,y (target): %f %f\n",x,y); + printf("mob[2].x,y: %f %f\n", env->mobs[2].x, env->mobs[2].y); + printf("num_mob: %d\n", num_mob); + for (int nm=0; nm < MAX_MOBS; nm++) + { + this_mob = &env->mobs[nm]; + printf("mobs[%d]->x,y: %f %f\n", nm, this_mob->x, this_mob->y); + printf("mobs[%d].x,y: %f %f\n", nm, env->mobs[nm].x, env->mobs[nm].y); + } + exit(1); + } + if (env->renderer != NULL) // i.e. 
if we're in eval-mode, with display + printf("Picking, type %d\n", this_mob->color); + agent->held = this_mob; + this_mob->alive = 0; + env->grid[adr] = EMPTY; + // We don't need to move the agent here, this is done below + } + } + + + int start_y = round(agent->y); + int start_x = round(agent->x); + int start_adr = grid_offset(env, start_y, start_x); + env->grid[start_adr] = EMPTY; + + env->grid[adr] = agent->color; + agent->y = y; + agent->x = x; + return 0; +} + +bool step_mob(Grixel* env, int idx) { + Mob* mob = &env->mobs[idx]; + if (!env->is_mobile[mob->color]) + { + puts("Error! Trying to step a non-mobile mob"); + exit(1); + } + mob->prev_y = mob->y; + mob->prev_x = mob->x; + + float x = mob->x; + float y = mob->y; + float dx = -1.0 + rand() % 3; + float dy = -1.0 + rand() % 3; + float dest_x = x + dx; + float dest_y = y + dy; + if (dest_x < 2 && dest_y < 2) { + return false; + } + if (!in_bounds(env, dest_y, dest_x)) { + return false; + } + //int err = move_to(env, idx, dest_y, dest_x); + int adr = grid_offset(env, round(dest_y), round(dest_x)); + if (env->grid[adr] != EMPTY) { + return false; + } + int start_y = round(mob->y); + int start_x = round(mob->x); + int start_adr = grid_offset(env, start_y, start_x); + assert(env->grid[start_adr] == mob->color); + env->grid[start_adr] = EMPTY; + env->grid[adr] = mob->color; + mob->y = dest_y; + mob->x = dest_x; + return true; // we successfully moved +} + + + +bool step_agent(Grixel* env, int idx) { + Agent* agent = &env->agents[idx]; + agent->prev_y = agent->y; + agent->prev_x = agent->x; + + float atn = env->actions[idx]; + int iatn = (int)atn; + float direction = agent->direction; + + if (iatn == ATN_PASS) { + return true; + } else if (iatn == ATN_DROP) { + if (agent->held == NULL) + return true; + // else... nothing, just like for Forward! Actual dropping is dealt with below. 
+
+    } else if (iatn == ATN_FORWARD) {
+    } else if (iatn == ATN_LEFT) {
+        direction -= PI/2.0;
+    } else if (iatn == ATN_RIGHT) {
+        direction += PI/2.0;
+    } else if (iatn == ATN_BACK) {
+        direction += PI;
+    } else {
+        printf("Invalid action: %f\n", atn);
+        exit(1);
+    }
+    if (direction < 0) {
+        direction += TWO_PI;
+    } else if (direction >= TWO_PI) {
+        direction -= TWO_PI;
+    }
+
+    float x = agent->x;
+    float y = agent->y;
+    float dx = env->speed*cos(direction);
+    float dy = env->speed*sin(direction);
+    agent->direction = direction;
+    float dest_x = x + dx;
+    float dest_y = y + dy;
+    if (!in_bounds(env, dest_y, dest_x)) {
+        return false;
+    }
+
+    if (iatn == ATN_DROP)
+    {
+        // We're dropping
+        // need to revive the held mob, and spawn it at dest_x,dest_y
+        assert( agent->held != NULL);
+        int adr = grid_offset(env, round(dest_y), round(dest_x));
+        if (env->grid[adr] != EMPTY)
+            // should check if killable (bumpable?), give appropriate reward, remove the killed mob, delete grid[adr]
+            return false;
+        agent->held->x = dest_x;
+        agent->held->y = dest_y;
+        agent->held->alive = 1;
+        env->grid[adr] = agent->held->color;
+        agent->held = NULL;
+        if (env->renderer != NULL) // i.e. 
if we're in eval-mode, with display + printf("Dropping, type %d\n", env->grid[adr]); + return true; + } + else{ + assert (iatn == ATN_FORWARD); + int err = move_to(env, idx, dest_y, dest_x); + if (err) { + return false; + } + } + + int x_int = agent->x; + int y_int = agent->y; + int adr = grid_offset(env, y_int, x_int); + env->counts[adr]++; + //env->rewards[idx] += 0.01 / (float)env->counts[adr]; + //env->log.episode_return += 0.01 / (float)env->counts[adr]; + return true; +} + +void c_step(Grixel* env) { + memset(env->terminals, 0, env->num_agents); + memset(env->rewards, 0, env->num_agents*sizeof(float)); + env->tick++; + + for (int i = 0; i < env->num_agents; i++) { + step_agent(env, i); + } + for (int i = 0; i < MAX_MOBS; i++) { + if (env->mobs[i].alive) + if (env->mobs[i].mobile) + step_mob(env, i); + } + // Note: compute_observations occurs *after* step_agent, + // as it should. + // Rewards must be computed *before* compute_observations + compute_observations(env); + + bool done = true; + for (int i = 0; i < env->num_agents; i++) { + if (!env->terminals[i]) { + done = false; + break; + } + } + + // prev_rewards are updated after everything + memcpy(env->prev_rewards, env->rewards, env->num_agents*sizeof(float)); + + if (env->tick >= env->horizon) { + done = true; + //add_log(env, 0); + for (int i=0; i < env->num_agents; i++){ + env->terminals[i] = 1; // Truncations not fully implemented + add_log(env, i); + } + } + + if (done) { + c_reset(env); + // This is already done in c_reset! 
+ //int idx = rand() % env->num_maps; + //set_state(env, &env->levels[idx]); + //compute_observations(env); + } +} + +// Raylib client +Color COLORS[] = { + (Color){6, 24, 24, 255}, + (Color){0, 0, 255, 255}, + (Color){0, 128, 255, 255}, + (Color){128, 128, 128, 255}, + (Color){255, 0, 0, 255}, + (Color){255, 255, 255, 255}, + (Color){255, 85, 85, 255}, + (Color){170, 170, 170, 255}, + (Color){0, 255, 255, 255}, + (Color){255, 255, 0, 255}, +}; + +Rectangle UV_COORDS[7] = { + (Rectangle){0, 0, 0, 0}, + (Rectangle){512, 0, 128, 128}, + (Rectangle){0, 0, 0, 0}, + (Rectangle){0, 0, 128, 128}, + (Rectangle){128, 0, 128, 128}, + (Rectangle){256, 0, 128, 128}, + (Rectangle){384, 0, 128, 128}, +}; + +struct Renderer { + int cell_size; + int nb_object_types; + int width; + int height; + Texture2D puffer; + Texture2D *renderer_textures; // different from env->block_textures (i.e. larger pixel size) + float* overlay; +}; + +//Renderer* init_renderer(int cell_size, int width, int height) { +Renderer* init_renderer(Grixel *env, int width, int height) { + Renderer* renderer = (Renderer*)calloc(1, sizeof(Renderer)); + renderer->width = width; + renderer->height = height; + renderer->nb_object_types = env->nb_object_types; + + int pixel_size = 3; // how big each individual pixel will be shown on screen + int cell_size = pixel_size * env->block_size; + renderer->cell_size = cell_size; + + renderer->overlay = (float*)calloc(width*height, sizeof(float)); + + InitWindow(width*cell_size, height*cell_size, "PufferLib Grixel"); + SetTargetFPS(60); + + renderer->puffer = LoadTexture("resources/shared/puffers_128.png"); + + renderer->renderer_textures = calloc(env->nb_object_types , sizeof(Texture2D)); + + for (int numt=0; numtnb_object_types; numt++){ + //Image image = GenImageColor(env->blocksize, env->blocksize, BLACK); + Image image = GenImageColor(cell_size, cell_size, BLACK); + for (int numc=0; numc< env->block_size; numc ++) + for (int numr=0; numr< env->block_size; numr ++) + if 
(env->block_textures[numt*env->block_size*env->block_size + numr*env->block_size + numc] >0) + for (int p1=0; p1 < pixel_size; p1++) + for (int p2=0; p2 < pixel_size; p2++) + ImageDrawPixel(&image, numc*pixel_size+p1, numr*pixel_size+p2, WHITE); + else + for (int p1=0; p1 < pixel_size; p1++) + for (int p2=0; p2 < pixel_size; p2++) + ImageDrawPixel(&image, numc*pixel_size+p1, numr*pixel_size+p2, BLACK); //BLUE); + renderer->renderer_textures[numt] = LoadTextureFromImage(image); + UnloadImage(image); + } + + return renderer; +} + +void clear_overlay(Renderer* renderer) { + memset(renderer->overlay, 0, renderer->width*renderer->height*sizeof(float)); +} + +void close_renderer(Renderer* renderer) { + CloseWindow(); + free(renderer->overlay); + UnloadTexture(renderer->puffer); // let's see what happens + for (int numt=0; numtnb_object_types; numt++) + UnloadTexture(renderer->renderer_textures[numt]); + // no free(renderer->puffer) ? or UnloadTexture? + free(renderer); + +} + +void c_render(Grixel* env) { + // TODO: fractional rendering + float frac = 0.0; + float overlay = 0.0; + if (env->renderer == NULL) { + //env->renderer = init_renderer(16, env->max_size, env->max_size); // Renderer* init_renderer(int cell_size, int width, int height) + env->renderer = init_renderer(env, env->max_size, env->max_size); // Renderer should compute its own cell size + } + Renderer* renderer = env->renderer; + + if (IsKeyDown(KEY_ESCAPE)) { + exit(0); + } + + Agent* agent = &env->agents[0]; + int r = agent->y; + int c = agent->x; + int adr = grid_offset(env, r, c); + //renderer->overlay[adr] = overlay; + //renderer->overlay[adr] -= 0.1; + //renderer->overlay[adr] = -1 + 1.0/(float)env->counts[adr]; + + BeginDrawing(); + ClearBackground((Color){6, 24, 24, 255}); + + int ts = renderer->cell_size; + for (int r = 0; r < env->height; r++) { + for (int c = 0; c < env->width; c++){ + adr = grid_offset(env, r, c); + int tile = env->grid[adr]; + if (tile == EMPTY) { + continue; + // So, the rest 
of this block is ignored?... + overlay = renderer->overlay[adr]; + if (overlay == 0) { + continue; + } + Color color; + if (overlay < 0) { + overlay = -fmaxf(-1.0, overlay); + color = (Color){255.0*overlay, 0, 0, 255}; + } else { + overlay = fminf(1.0, overlay); + color = (Color){0, 255.0*overlay, 0, 255}; + } + DrawRectangle(c*ts, r*ts, ts, ts, color); + } + + /* + Color color; + if (tile == WALL) { + color = (Color){128, 128, 128, 255}; + } else if (tile == GOAL) { + color = GREEN; + } else if (is_locked_door(tile)) { + int weight = 40*(tile - DOOR_LOCKED); + color = (Color){weight, 0, 0, 255}; + } else if (is_open_door(tile)) { + int weight = 40*(tile - DOOR_OPEN); + color = (Color){0, weight, 0, 255}; + } else if (is_key(tile)) { + int weight = 40*(tile - KEY); + color = (Color){0, 0, weight, 255}; + } else { + continue; + } + DrawRectangle(c*ts, r*ts, ts, ts, color); + */ + if (tile != AGENT) // Agent is fish! + DrawTexture(renderer->renderer_textures[tile], c*ts, r*ts, WHITE); + } + } + + for (int i = 0; i < env->num_agents; i++) { + agent = &env->agents[0]; + float y = agent->y + (frac - 1)*(agent->y - agent->prev_y); + float x = agent->x + (frac - 1)*(agent->x - agent->prev_x); + int u = 0; + int v = 0; + Rectangle source_rect = (Rectangle){u, v, 128, 128}; + Rectangle dest_rect = (Rectangle){x*ts, y*ts, ts, ts}; + DrawTexturePro(renderer->puffer, source_rect, dest_rect, + (Vector2){0, 0}, 0, WHITE); + } + + EndDrawing(); +} + + +void generate_growing_tree_maze(unsigned char* grid, + int width, int height, int max_size, float difficulty, int seed) { + srand(seed); + int dx[4] = {-1, 0, 1, 0}; + int dy[4] = {0, 1, 0, -1}; + int dirs[4] = {0, 1, 2, 3}; + int cells[2*width*height]; + int num_cells = 1; + + bool visited[width*height]; + memset(visited, false, width*height); + + memset(grid, WALL, max_size*height); + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + int adr = r*max_size + c; + if (r % 2 == 1 && c % 2 == 1) { + grid[adr] = 
EMPTY; + } + } + } + + int x_init = rand() % (width - 1); + int y_init = rand() % (height - 1); + + if (x_init % 2 == 0) { + x_init++; + } + if (y_init % 2 == 0) { + y_init++; + } + + int adr = y_init*height + x_init; + visited[adr] = true; + cells[0] = x_init; + cells[1] = y_init; + + //int cell = 32; + //InitWindow(width*cell, height*cell, "PufferLib Ray Grixel"); + //SetTargetFPS(60); + + while (num_cells > 0) { + if (rand() % 1000 > 1000*difficulty) { + int i = rand() % num_cells; + int tmp_x = cells[2*num_cells - 2]; + int tmp_y = cells[2*num_cells - 1]; + cells[2*num_cells - 2] = cells[2*i]; + cells[2*num_cells - 1] = cells[2*i + 1]; + cells[2*i] = tmp_x; + cells[2*i + 1] = tmp_y; + + } + + int x = cells[2*num_cells - 2]; + int y = cells[2*num_cells - 1]; + + int nx, ny; + + // In-place direction shuffle + for (int i = 0; i < 4; i++) { + int ii = i + rand() % (4 - i); + int tmp = dirs[i]; + dirs[i] = dirs[ii]; + dirs[ii] = tmp; + } + + bool made_path = false; + for (int dir_i = 0; dir_i < 4; dir_i++) { + int dir = dirs[dir_i]; + nx = x + 2*dx[dir]; + ny = y + 2*dy[dir]; + + if (nx <= 0 || nx >= width-1 || ny <= 0 || ny >= height-1) { + continue; + } + + int visit_adr = ny*width + nx; + if (visited[visit_adr]) { + continue; + } + + visited[visit_adr] = true; + cells[2*num_cells] = nx; + cells[2*num_cells + 1] = ny; + + nx = x + dx[dir]; + ny = y + dy[dir]; + + int adr = ny*max_size + nx; + grid[adr] = EMPTY; + num_cells++; + + made_path = true; + + /* + if (IsKeyPressed(KEY_ESCAPE)) { + exit(0); + } + BeginDrawing(); + ClearBackground((Color){6, 24, 24, 255}); + Color color = (Color){128, 128, 128, 255}; + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++){ + int adr = r*max_size + c; + int tile = grid[adr]; + if (tile == WALL) { + DrawRectangle(c*cell, r*cell, cell, cell, color); + } + } + } + EndDrawing(); + */ + + break; + } + if (!made_path) { + num_cells--; + } + } +} + +// Map creation / filling. 
+// This is called by my_shared in binding.c, which generates +// the shared set of pre-generated maps to be used by all +// environments +void create_maze_level(Grixel* env, int width, int height, float difficulty, int seed) { + env->width = width; + env->height = height; + memset(env->grid, EMPTY, env->max_size*env->max_size); + generate_growing_tree_maze(env->grid, width, height, env->max_size, difficulty, seed); + int posx, posy, adr; + for (int myc=0; myc < width; myc++) + for (int myr=0; myr < height; myr++) + if (rand() % 2 == 0) { + adr = grid_offset(env, myr, myc); + env->grid[adr] = EMPTY; + } + make_border(env); + for (int m=0; m < MAX_MOBS; m++){ + env->mobs[m].alive=0; + } + spawn_agent(env, 0, 1, 1); + // We spawn the agent, but we do not spawn + // the mobs - we do that at c_reset, rather + // than here at map creation (so the mob + // positions are randomized at each reset) + + //int goal_adr = grid_offset(env, env->height - 2, env->width - 2); + //env->grid[goal_adr] = GOAL; +} + + +void fill_textures_randomize(Grixel* env) +{ + int choice; + int cpt=0; + choice = 1 - rand() % 2; + //choice = 0; + //env->choice = choice; + if (env->renderer != NULL){ // are we in the eval / visualized env? 
+ printf("Texture assignment - Choice is %d \n", choice); + /*if (choice == 0) + printf("reward is cross, zombie is checkers\n"); + else + printf("reward is checkers, zombie is cross\n");*/ + } + + + //for (int numt=0; numt<32; numt++) + for (int numt=0; numtnb_object_types; numt++) + for (int numr=0; numr < env->block_size; numr++) + for (int numc=0; numc < env->block_size; numc++) + { + int value = 0; + if (numt==0) + value=0; + else if (numt==1){ + value = 1; + /*if (choice == 0) + if (numc==3) + if (numr==3) + value=0;*/ + } + else if (is_agent(numt)) + value = (numc==numr); // the agent perceives itself as a diagonal slash + else if (numt == GOAL || numt == REWARD) { + if (env->texture_mode == 0) + value = (numr+numc+1) % 2; + else if (env->texture_mode == 1){ + if (choice == 0) + value = (numr == env->block_size/2 || numc == env->block_size/2) ? 1: 0; + else + value = (numr+numc+1) % 2; + } + else + value = rand() % 2; + //value = (numr+numc+1) % 2; + } + else if (numt == ZOMBIE) { + if (env->texture_mode == 0) + value = (numr == env->block_size/ 2 || numc == env->block_size/2) ? 1: 0; + else if (env->texture_mode == 1){ + if (choice == 1) + value = (numr == env->block_size/2 || numc == env->block_size/2) ? 1: 0; + else + value = (numr+numc+1) % 2; + } + else + value = rand() % 2; + } + else + value = rand() % 2; + env->block_textures[cpt++] = (char) value; + } + if (env->renderer != NULL){ // are we in the eval / visualized env? 
+ puts("Texture assignment done"); + /*if (choice == 0) + printf("reward is cross, zombie is checkers\n"); + else + printf("reward is checkers, zombie is cross\n");*/ + } +} + +void update_renderer_textures(Grixel *env){ + + Renderer *renderer = env->renderer; + int cell_size = env->renderer->cell_size; + int pixel_size = cell_size / env->block_size; // how big each individual pixel will be shown on screen + for (int numt=0; numtnb_object_types; numt++){ + if (numt != 0 && numt != 1 && numt != REWARD && numt != GOAL && numt != ZOMBIE && !(is_agent(numt))) + continue; + //Image image = GenImageColor(env->blocksize, env->blocksize, BLACK); + Image image = GenImageColor(cell_size, cell_size, BLACK); + for (int numc=0; numc< env->block_size; numc ++) + for (int numr=0; numr< env->block_size; numr ++) + if (env->block_textures[numt*env->block_size*env->block_size + numr*env->block_size + numc] >0) + for (int p1=0; p1 < pixel_size; p1++) + for (int p2=0; p2 < pixel_size; p2++) + ImageDrawPixel(&image, numc*pixel_size+p1, numr*pixel_size+p2, WHITE); + else + for (int p1=0; p1 < pixel_size; p1++) + for (int p2=0; p2 < pixel_size; p2++) + ImageDrawPixel(&image, numc*pixel_size+p1, numr*pixel_size+p2, BLACK); //BLUE); + renderer->renderer_textures[numt] = LoadTextureFromImage(image); + UnloadImage(image); + } +} diff --git a/pufferlib/ocean/grixel/grixel.py b/pufferlib/ocean/grixel/grixel.py new file mode 100644 index 0000000000..15ecce06aa --- /dev/null +++ b/pufferlib/ocean/grixel/grixel.py @@ -0,0 +1,101 @@ +import numpy as np +import os + +import gymnasium + +import pdb + +import pufferlib +from pufferlib.ocean.grixel import binding + +class Grixel(pufferlib.PufferEnv): + def __init__(self, render_mode='raylib', vision_range=5, + num_envs=4096, num_maps=1000, map_size=-1, max_size=9, + texture_mode=0, + report_interval=128, buf=None, seed=0): + assert map_size <= max_size + + + self.texture_mode = texture_mode + self.pixelize = 1 + self.block_size = 5 + 
self.nb_object_types = 7 # determines pocket size for observations + + # vision_range better be 5 + self.obs_diameter = 2*vision_range + 1 + if self.pixelize>0: + self.obs_diameter *= self.block_size + self.additional_obs_size = 3 + self.nb_object_types # reward, reset, extra + pocket size + + self.single_observation_space = gymnasium.spaces.Box(low=-100, high=100, + shape=(self.obs_diameter*self.obs_diameter + self.additional_obs_size,), dtype=np.int8) + #self.single_action_space = gymnasium.spaces.Discrete(5) + # pass, forward, turn left, turn right, turn back, drop (see top of grixel.h) + self.single_action_space = gymnasium.spaces.Discrete(6) + self.render_mode = render_mode + self.num_agents = num_envs + self.report_interval = report_interval + + # creates the buffers unless passed as args, + # also probably reads the .ini file + super().__init__(buf=buf) + + + # It's annoying that we have to pass env parameters to both binding.shared + # and vec_init... but what's the alternative? Each of them independently + # creates a (C) Grixel env (from grixel.h) and uses it for init_grid.... 
+ self.float_actions = np.zeros_like(self.actions).astype(np.float32) + self.c_state = binding.shared(num_maps=num_maps, max_size=max_size, size=map_size, + pixelize=self.pixelize, block_size=self.block_size, + additional_obs_size=self.additional_obs_size, + nb_object_types=self.nb_object_types) + + + self.c_envs = binding.vec_init(self.observations, self.float_actions, + self.rewards, self.terminals, self.truncations, num_envs, seed, + state=self.c_state, max_size=max_size, num_maps=num_maps, pixelize=self.pixelize, block_size=self.block_size, additional_obs_size=self.additional_obs_size, + texture_mode=self.texture_mode, + nb_object_types=self.nb_object_types + ) + pass + + def reset(self, seed=None): + self.tick = 0 + binding.vec_reset(self.c_envs, seed) + return self.observations, [] + + def step(self, actions): + self.float_actions[:] = actions + binding.vec_step(self.c_envs) + + info = [] + if self.tick % self.report_interval == 0: + info.append(binding.vec_log(self.c_envs)) + + self.tick += 1 + return (self.observations, self.rewards, + self.terminals, self.truncations, info) + + def render(self, overlay=0): + binding.vec_render(self.c_envs, overlay) + + def close(self): + pass + #binding.vec_close(self.c_envs) + +def test_performance(timeout=10, atn_cache=1024): + #env = CGrid(num_envs=1000) + env = Grixel(num_envs=1000) + env.reset() + tick = 0 + + actions = np.random.randint(0, 2, (atn_cache, env.num_envs)) + + import time + start = time.time() + while time.time() - start < timeout: + atn = actions[tick % atn_cache] + env.step(atn) + tick += 1 + + print(f'SPS: %f', env.num_envs * tick / (time.time() - start)) diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index c7663c5f5b..fa32944ca9 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -1,6 +1,10 @@ from types import SimpleNamespace from typing import Any, Tuple + +import pdb + + from gymnasium import spaces from torch import nn @@ -15,6 +19,8 @@ from pufferlib.models 
import Default as Policy from pufferlib.models import Convolutional as Conv Recurrent = pufferlib.models.LSTMWrapper +#RecurrentPlastic = pufferlib.models.LSTMWrapperPlastic +#RecurrentTransformer = pufferlib.models.LSTMTransformerWrapper from pufferlib.pytorch import layer_init, _nativize_dtype, nativize_tensor import numpy as np @@ -251,6 +257,289 @@ def encode_observations(self, observations, state=None): super().encode_observations(observations, state) ''' +class Grixel_previous(nn.Module): + def __init__(self, env, cnn_channels=32, hidden_size=128, **kwargs): + super().__init__() + self.hidden_size = hidden_size + + + self.is_pixelized = env.pixelize; + self.block_size= env.block_size; + self.obs_diameter = env.obs_diameter; # hard-coded to 11 (* block_size if pixelized) in various places of the code + self.additional_obs_size = env.additional_obs_size; + + if self.is_pixelized: + self.network = nn.Sequential( + pufferlib.pytorch.layer_init( + nn.Conv2d(1, cnn_channels, self.block_size, stride=self.block_size)), + # output should now have shape 11 x 11 x cnn_channels - should be independent of block_size + nn.LeakyReLU(), + pufferlib.pytorch.layer_init( + nn.Conv2d(cnn_channels, cnn_channels, 3, stride=2)), + # output should now have shape 5 x 5 x cnn_channels + pufferlib.pytorch.layer_init( + nn.Conv2d(cnn_channels, cnn_channels, 3, stride=2)), + # output should now have shape 2 x 2 x cnn_channels + nn.Flatten(), + nn.LeakyReLU(), + pufferlib.pytorch.layer_init(nn.Linear(4 * cnn_channels, hidden_size)), + nn.LeakyReLU(), + ) + else: + self.network = nn.Sequential( + pufferlib.pytorch.layer_init( + nn.Conv2d(32, cnn_channels, 5, stride=3)), + nn.ReLU(), + pufferlib.pytorch.layer_init( + nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), + nn.Flatten(), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Linear(cnn_channels, hidden_size)), + nn.ReLU(), + ) + + + self.additional_network = nn.Sequential( + 
pufferlib.pytorch.layer_init(nn.Linear(self.additional_obs_size, hidden_size//2)), + #pufferlib.pytorch.layer_init(nn.Linear(self.additional_obs_size, 16)), + #pufferlib.pytorch.layer_init(nn.Linear(self.additional_obs_size, hidden_size)), + nn.LeakyReLU(), + ) + self.mixer_network= nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(hidden_size + hidden_size//2, hidden_size)), + #pufferlib.pytorch.layer_init(nn.Linear(hidden_size + 16, hidden_size)), + #nn.Linear(self.additional_obs_size, hidden_size), + nn.LeakyReLU(), + pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), + #nn.Linear(self.additional_obs_size, hidden_size), + nn.LeakyReLU(), + ) + + self.is_continuous = isinstance(env.single_action_space, pufferlib.spaces.Box) + if self.is_continuous: + self.decoder_mean = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01) + self.decoder_logstd = nn.Parameter(torch.zeros( + 1, env.single_action_space.shape[0])) + else: + num_actions = env.single_action_space.n + self.actor = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, num_actions), std=0.01) + + self.value_fn = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, 1), std=1) + + def forward(self, observations, state=None): + hidden = self.encode_observations(observations) + actions, value = self.decode_actions(hidden) + return actions, value + + def forward_train(self, x, state=None): + return self.forward(x, state) + def forward_eval(self, x, state=None): + return self.forward(x, state) + + def encode_observations(self, observations, state=None): + if self.is_pixelized: + # Here the visual observation has shape 1024 x (3025+add_obs_size) + # (3025 = 55*55) + # Magic incantation that turns a linear sequence of texture data + # into a square map of square blocks: + # Note: Batch size is variable over time, sometimes 1024, sometimes 8K... 
+ hidden = observations[:,:-self.additional_obs_size].reshape((-1, 11, 11, self.block_size, self.block_size)).permute((0, 1,3,2,4)).reshape((-1, 11*self.block_size, 11*self.block_size)).view((-1,1,11*self.block_size, 11*self.block_size)).float() + #hidden = torch.zeros((observations.shape[0],1,55,55), device='cuda') + if not( torch.all(hidden>=0) and torch.all(hidden<100)): + print(">>> ERROR >>>> Out-of-range visual observations") + pdb.set_trace() + #debug, use if the observation "pixels" are actually all set to tile number to get symbolic inputs: + #hidden = hidden[:, 0, ::5, ::5].long() + #hidden = F.one_hot(hidden, 32).permute(0, 3, 1, 2).float() + else: + hidden = observations.view(-1, 11, 11).long() + hidden = F.one_hot(hidden, 32).permute(0, 3, 1, 2).float() + #debugging + #out = hidden + #for part in self.network.children(): + # out=part(out) + # print(part, "Output shape:", out.shape) + #pdb.set_trace() + + hidden = self.mixer_network(torch.cat((self.network(hidden), self.additional_network(observations[:,-self.additional_obs_size:].float())) , dim=1)) + #hidden = self.network(hidden) + self.additional_network(observations[:,-self.additional_obs_size:].float()) + + #hidden = torch.zeros((observations.shape[0], 512), device='cuda') + + return hidden + + def decode_actions(self, flat_hidden, state=None): + value = self.value_fn(flat_hidden) + if self.is_continuous: + mean = self.decoder_mean(flat_hidden) + logstd = self.decoder_logstd.expand_as(mean) + std = torch.exp(logstd) + probs = torch.distributions.Normal(mean, std) + batch = flat_hidden.shape[0] + return probs, value + else: + action = self.actor(flat_hidden) + return action, value + + +class Grixel(nn.Module): + def __init__(self, env, cnn_channels=32, hidden_size=128, **kwargs): + super().__init__() + self.hidden_size = hidden_size + + + self.is_pixelized = env.pixelize; + self.block_size= env.block_size; + self.obs_diameter = env.obs_diameter; # hard-coded to 11 (* block_size if pixelized) in 
various places of the code + self.additional_obs_size = env.additional_obs_size; + + if self.is_pixelized: + self.network = nn.Sequential( + # Pure MLP + #nn.Flatten(), + #pufferlib.pytorch.layer_init( + # nn.Linear(55*55, hidden_size)), + #nn.LeakyReLU(), + #pufferlib.pytorch.layer_init( + # nn.Linear(hidden_size, hidden_size)), + #nn.LeakyReLU(), + #pufferlib.pytorch.layer_init( + # nn.Linear(hidden_size, hidden_size)), + #nn.LeakyReLU(), + + + # Convolutional, increasing NumChannels + pufferlib.pytorch.layer_init( + nn.Conv2d(1, cnn_channels, self.block_size, stride=self.block_size)), + # output should now have shape 11 x 11 x cnn_channels - should be independent of block_size + nn.LeakyReLU(), + pufferlib.pytorch.layer_init( + nn.Conv2d(cnn_channels, 2*cnn_channels, 3, stride=2)), + # output should now have shape 5 x 5 x 2*cnn_channels + pufferlib.pytorch.layer_init( + nn.Conv2d(2*cnn_channels, 4*cnn_channels, 3, stride=2)), + # output should now have shape 2 x 2 x 4*cnn_channels + nn.Flatten(), + nn.LeakyReLU(), + pufferlib.pytorch.layer_init(nn.Linear(4 * 4 * cnn_channels, hidden_size)), + nn.LeakyReLU(), + + + ## Convolutional, short (no bottleneck, no channel expansion) + #pufferlib.pytorch.layer_init( + # nn.Conv2d(1, cnn_channels, self.block_size, stride=self.block_size)), + ## output should now have shape 11 x 11 x cnn_channels - should be independent of block_size + #nn.LeakyReLU(), + #pufferlib.pytorch.layer_init( + # nn.Conv2d(cnn_channels, cnn_channels, 3, stride=2)), + ## output should now have shape 5 x 5 x cnn_channels + #nn.Flatten(), + #nn.LeakyReLU(), + #pufferlib.pytorch.layer_init(nn.Linear(5 * 5 * cnn_channels, hidden_size)), + #nn.LeakyReLU(), + ) + else: + self.network = nn.Sequential( + pufferlib.pytorch.layer_init( + nn.Conv2d(32, cnn_channels, 5, stride=3)), + nn.ReLU(), + pufferlib.pytorch.layer_init( + nn.Conv2d(cnn_channels, cnn_channels, 3, stride=1)), + nn.Flatten(), + nn.ReLU(), + 
pufferlib.pytorch.layer_init(nn.Linear(cnn_channels, hidden_size)), + nn.ReLU(), + ) + + + self.additional_network = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(self.additional_obs_size, hidden_size//2)), + #pufferlib.pytorch.layer_init(nn.Linear(self.additional_obs_size, 16)), + #pufferlib.pytorch.layer_init(nn.Linear(self.additional_obs_size, hidden_size)), + nn.LeakyReLU(), + ) + self.mixer_network= nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(hidden_size + hidden_size//2, hidden_size)), + #pufferlib.pytorch.layer_init(nn.Linear(hidden_size + 16, hidden_size)), + #nn.Linear(self.additional_obs_size, hidden_size), + nn.LeakyReLU(), + pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), + #nn.Linear(self.additional_obs_size, hidden_size), + nn.LeakyReLU(), + ) + + self.is_continuous = isinstance(env.single_action_space, pufferlib.spaces.Box) + if self.is_continuous: + self.decoder_mean = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01) + self.decoder_logstd = nn.Parameter(torch.zeros( + 1, env.single_action_space.shape[0])) + else: + num_actions = env.single_action_space.n + self.actor = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, num_actions), std=0.01) + + self.value_fn = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, 1), std=1) + + def forward(self, observations, state=None): + hidden = self.encode_observations(observations) + actions, value = self.decode_actions(hidden) + return actions, value + + def forward_train(self, x, state=None): + return self.forward(x, state) + def forward_eval(self, x, state=None): + return self.forward(x, state) + + def encode_observations(self, observations, state=None): + if self.is_pixelized: + # Here the visual observation has shape 1024 x (3025+add_obs_size) + # (3025 = 55*55) + # Magic incantation that turns a linear sequence of texture data + # into a square map of square blocks: + # Note: Batch size is variable over 
time, sometimes 1024, sometimes 8K... + hidden = observations[:,:-self.additional_obs_size].reshape((-1, 11, 11, self.block_size, self.block_size)).permute((0, 1,3,2,4)).reshape((-1, 11*self.block_size, 11*self.block_size)).view((-1,1,11*self.block_size, 11*self.block_size)).float() + #hidden = torch.zeros((observations.shape[0],1,55,55), device='cuda') + if not( torch.all(hidden>=0) and torch.all(hidden<100)): + print("Out-of-range visual observations") + pdb.set_trace() + #debug, use if the observation "pixels" are actually all set to tile number to get symbolic inputs: + #hidden = hidden[:, 0, ::5, ::5].long() + #hidden = F.one_hot(hidden, 32).permute(0, 3, 1, 2).float() + else: + hidden = observations.view(-1, 11, 11).long() + hidden = F.one_hot(hidden, 32).permute(0, 3, 1, 2).float() + #debugging + #out = hidden + #for part in self.network.children(): + # out=part(out) + # print(part, "Output shape:", out.shape) + #pdb.set_trace() + + hidden = self.mixer_network(torch.cat((self.network(hidden), self.additional_network(observations[:,-self.additional_obs_size:].float())) , dim=1)) + #hidden = self.network(hidden) + self.additional_network(observations[:,-self.additional_obs_size:].float()) + + #hidden = torch.zeros((observations.shape[0], 512), device='cuda') + + return hidden + + def decode_actions(self, flat_hidden, state=None): + value = self.value_fn(flat_hidden) + if self.is_continuous: + mean = self.decoder_mean(flat_hidden) + logstd = self.decoder_logstd.expand_as(mean) + std = torch.exp(logstd) + probs = torch.distributions.Normal(mean, std) + batch = flat_hidden.shape[0] + return probs, value + else: + action = self.actor(flat_hidden) + return action, value + class Grid(nn.Module): def __init__(self, env, cnn_channels=32, hidden_size=128, **kwargs): super().__init__() @@ -557,6 +846,7 @@ def __init__( num_drones: int = 2, continuous: bool = False, is_training: bool = True, + device: str = "cuda", **kwargs, ): super().__init__() @@ -574,13 +864,13 @@ 
def __init__( + [self.obsInfo.wallTypes + 1] * self.obsInfo.numFloatingWallObs + [self.numDrones + 1] * self.obsInfo.numProjectileObs, ) - discreteOffsets = torch.tensor([0] + list(np.cumsum(self.discreteFactors)[:-1])).view( + discreteOffsets = torch.tensor([0] + list(np.cumsum(self.discreteFactors)[:-1]), device=device).view( 1, -1 ) self.register_buffer("discreteOffsets", discreteOffsets, persistent=False) self.discreteMultihotDim = self.discreteFactors.sum() - multihotBuffer = torch.zeros(batch_size, self.discreteMultihotDim) + multihotBuffer = torch.zeros(batch_size, self.discreteMultihotDim, device=device) self.register_buffer("multihotOutput", multihotBuffer, persistent=False) # most of the observation is a 2D array of bytes, but the end @@ -743,38 +1033,44 @@ def _computeCNNShape(self) -> int: t = torch.as_tensor(mapSpace.sample()[None]) return self.mapCNN(t).shape[1] -class Drive(nn.Module): +class GPUDrive(nn.Module): def __init__(self, env, input_size=128, hidden_size=128, **kwargs): super().__init__() self.hidden_size = hidden_size self.ego_encoder = nn.Sequential( pufferlib.pytorch.layer_init( - nn.Linear(7, input_size)), - nn.LayerNorm(input_size), + nn.Linear(6, input_size)), # nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Linear(input_size, input_size)) + # pufferlib.pytorch.layer_init( + # nn.Linear(input_size, input_size)) ) max_road_objects = 13 self.road_encoder = nn.Sequential( pufferlib.pytorch.layer_init( nn.Linear(max_road_objects, input_size)), - nn.LayerNorm(input_size), # nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Linear(input_size, input_size)) + # pufferlib.pytorch.layer_init( + # nn.Linear(input_size, input_size)) ) max_partner_objects = 7 self.partner_encoder = nn.Sequential( pufferlib.pytorch.layer_init( nn.Linear(max_partner_objects, input_size)), - nn.LayerNorm(input_size), # nn.ReLU(), - pufferlib.pytorch.layer_init( - nn.Linear(input_size, input_size)) + # pufferlib.pytorch.layer_init( + # nn.Linear(input_size, 
class Tetris(nn.Module):
    """Policy for the Tetris environment.

    Encodes the board through a small CNN (two binary planes: settled
    cells vs. the active piece) and the scalar side-information through
    an MLP, then fuses both into a hidden feature vector consumed by the
    actor (7 discrete actions) and critic heads.
    """

    def __init__(self, env, cnn_channels=32, input_size=128, hidden_size=128, **kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.cnn_channels = cnn_channels
        self.n_cols = env.n_cols
        self.n_rows = env.n_rows
        # 6 scalars plus a 7-way one-hot per deck slot (deck + current piece).
        self.scalar_input_size = 6 + 7 * (env.deck_size + 1)

        # Each conv uses row-kernel 5, row-stride 2, row-padding 2, so the
        # row count maps to (r - 1) // 2 + 1 per layer; the column count is
        # preserved (stride 1, "same" padding). Deriving the flattened size
        # from n_rows/n_cols generalizes the previous hard-coded
        # `cnn_channels * 3 * 10`, which was valid only for a 20x10 board.
        rows = self.n_rows
        for _ in range(3):
            rows = (rows - 1) // 2 + 1
        self.flat_conv_size = cnn_channels * rows * self.n_cols

        self.is_continuous = isinstance(env.single_action_space, pufferlib.spaces.Box)

        self.conv_grid = nn.Sequential(
            pufferlib.pytorch.layer_init(nn.Conv2d(2, cnn_channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
            nn.ReLU(),
            pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
            nn.ReLU(),
            pufferlib.pytorch.layer_init(nn.Conv2d(cnn_channels, cnn_channels, kernel_size=(5, 5), stride=(2, 1), padding=(2, 2))),
            nn.ReLU(),
            nn.Flatten(),
            pufferlib.pytorch.layer_init(nn.Linear(self.flat_conv_size, input_size)),
        )

        self.fc_scalar = nn.Sequential(
            pufferlib.pytorch.layer_init(nn.Linear(self.scalar_input_size, input_size)),
            nn.ReLU(),
        )

        self.proj = nn.Sequential(
            pufferlib.pytorch.layer_init(nn.Linear(2 * input_size, hidden_size)),
            nn.ReLU(),
        )

        # 7 discrete actions. (The previous trailing nn.Flatten() was a
        # no-op on the (B, 7) logits and has been dropped.)
        self.actor = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 7), std=0.01)

        # Linear value head. The previous ReLU here clamped value estimates
        # to >= 0 so the critic could not represent negative returns; this
        # now matches the value heads of the other policies in this file.
        self.value_fn = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1.0)

    def forward(self, observations, state=None):
        """Encode observations and return (action logits, value)."""
        hidden = self.encode_observations(observations)
        actions, value = self.decode_actions(hidden)
        return actions, value

    def forward_train(self, x, state=None):
        """Training entry point; identical to forward() for this policy."""
        return self.forward(x, state)

    def encode_observations(self, observations, state=None):
        """Split the flat observation into board + scalars and encode both.

        Layout: the first n_rows * n_cols entries are the board cells
        (0 empty, 1 settled, 2 active piece), followed by
        scalar_input_size scalar features.
        """
        B = observations.shape[0]
        grid = observations[:, :self.n_cols * self.n_rows].view(B, self.n_rows, self.n_cols)
        # Two binary planes: settled cells and the falling piece.
        grid = torch.stack([(grid == 1).float(), (grid == 2).float()], dim=1)
        start = self.n_cols * self.n_rows
        scalars = observations[:, start:start + self.scalar_input_size].float()

        grid_feat = self.conv_grid(grid)        # (B, input_size)
        scalar_feat = self.fc_scalar(scalars)   # (B, input_size)
        combined = torch.cat([grid_feat, scalar_feat], dim=-1)  # (B, 2 * input_size)
        return self.proj(combined)              # (B, hidden_size)

    def decode_actions(self, hidden):
        """Return action logits of shape (B, 7) and values of shape (B, 1)."""
        action = self.actor(hidden)
        value = self.value_fn(hidden)
        return action, value
''' @@ -901,65 +1273,3 @@ def decode_actions(self, hidden): values = self.value(hidden) return logits, values - - -class G2048(nn.Module): - def __init__(self, env, hidden_size=128): - super().__init__() - self.hidden_size = hidden_size - self.is_continuous = False - - num_obs = np.prod(env.single_observation_space.shape) - - if hidden_size <= 256: - self.encoder = torch.nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(num_obs, 512)), - nn.GELU(), - pufferlib.pytorch.layer_init(nn.Linear(512, 256)), - nn.GELU(), - pufferlib.pytorch.layer_init(nn.Linear(256, hidden_size)), - nn.GELU(), - ) - else: - self.encoder = torch.nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(num_obs, 2*hidden_size)), - nn.GELU(), - pufferlib.pytorch.layer_init(nn.Linear(2*hidden_size, hidden_size)), - nn.GELU(), - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - nn.GELU(), - ) - - num_atns = env.single_action_space.n - self.decoder = torch.nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - nn.GELU(), - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, num_atns), std=0.01), - ) - self.value = torch.nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - nn.GELU(), - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1.0), - ) - - def forward_eval(self, observations, state=None): - hidden = self.encode_observations(observations, state=state) - logits, values = self.decode_actions(hidden) - return logits, values - - def forward(self, observations, state=None): - return self.forward_eval(observations, state) - - def encode_observations(self, observations, state=None): - batch_size = observations.shape[0] - observations = observations.view(batch_size, -1).float() - - # Scale the feat 1 (tile**1.5) - observations[:, :16] = observations[:, :16] / 100.0 - - return self.encoder(observations) - - def decode_actions(self, hidden): - logits = self.decoder(hidden) - values = self.value(hidden) 
- return logits, values From 084a08aeb2409b094ea7e48a119625a31029ee49 Mon Sep 17 00:00:00 2001 From: Thomas Miconi Date: Mon, 2 Mar 2026 15:38:41 -0800 Subject: [PATCH 2/2] Minor fix in Readme --- pufferlib/ocean/grixel/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/ocean/grixel/README.md b/pufferlib/ocean/grixel/README.md index d2baa87772..7e0e352a98 100644 --- a/pufferlib/ocean/grixel/README.md +++ b/pufferlib/ocean/grixel/README.md @@ -20,7 +20,7 @@ Crucially, the agent can also perceive previous-step reward as part of its input The encoder is a CNN where the input layer has both kernel size and stride equal to block_size: the first convolution thus separately maps each block of the gridworld into a single vector. -The experiment works with the standard LSTM from PufferLib's Recurrent model. We also implemented a transformer and a plastic LSTM, with the plastic LSTM performing best by far in this simple visual memory task. These are not included here as they require modifying the rest of the PufferLib code (though you can see these *highly experimental* implementations [there](https://github.com/ThomasMiconi/PufferLib/blob/grixel/pufferlib/models.py)). +The experiment works with the standard LSTM from PufferLib's Recurrent model. We also implemented a transformer and a plastic LSTM, with the plastic LSTM performing best by far in this simple visual memory task. These are not included here as they require modifying the rest of the PufferLib code (though you can see these *highly experimental* implementations [there](https://github.com/ThomasMiconi/PufferLib_dev/blob/grixel/pufferlib/models.py)). Notably, all episodes have the same lengths, equal to the backpropagation-through-time horizon of the PPO training loop. This avoids difficulties with changing environments and ensures each episode starts with a reset hidden state during training.