Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions config/ocean/osrs_inferno.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# Metal osrs_inferno config.
# long episodes (300-2000+ ticks), 7 action heads (76 logits), 380+76 obs.
# vf_coef must stay low (< 0.15) — fused decoder amplifies value gradients
# into policy logits via MinGRU scan backward. replay_ratio < 1.0 to avoid
# stale target drift.

[base]
env_name = osrs_inferno
# NOTE(review): inferno reports episode_return, unlike osrs_pvp/osrs_zulrah
# which use score — confirm this asymmetry is intentional.
score_metric = episode_return

[env]
# NOTE(review): both values look integer/boolean-valued but are written as
# floats — presumably the config loader parses all env keys as float; verify.
start_wave = 0.0
mask_in_obs = 1.0

[vec]
total_agents = 2048
num_buffers = 4
num_threads = 4

[policy]
hidden_size = 256
num_layers = 2

[train]
# anchor from sweep trial #33 (score 74.6, wave 17+, prayer 60%)
# Values below are exact sweep outputs — do not round them by hand; re-run
# the sweep (see [sweep.*] sections) to retune.
total_timesteps = 400000000
horizon = 32
min_lr_ratio = 0.003872
learning_rate = 0.003069
beta1 = 0.95
eps = 0.000004
ent_coef = 0.0017
gamma = 0.998319
gae_lambda = 0.8
vtrace_rho_clip = 2.243133
vtrace_c_clip = 1.971016
# prio_alpha = 0 disables prioritization even though prio_beta0 is nonzero.
prio_alpha = 0.0
prio_beta0 = 0.275787
clip_coef = 0.611932
# kept low per the vf_coef < 0.15 constraint documented at the top of file.
vf_coef = 0.063963
vf_clip_coef = 0.404894
max_grad_norm = 0.997781
replay_ratio = 0.790328
minibatch_size = 4096
ns_iters = 5
weight_decay = 0.089232

# --- Sweep search space ---------------------------------------------------
# Each [sweep.<group>.<key>] section defines the prior for one hyperparameter.
# The [train] anchors above came out of this space (trial #33).
[sweep]
min_sps = 50000
max_suggestion_cost = 3600
metric = episode_return
metric_distribution = linear

[sweep.train.horizon]
distribution = uniform_pow2
min = 32
max = 256
scale = auto

[sweep.train.learning_rate]
distribution = log_normal
min = 0.0003
max = 0.01
scale = 0.5

[sweep.train.ent_coef]
distribution = log_normal
min = 0.001
max = 0.03
scale = auto

[sweep.train.gamma]
distribution = logit_normal
min = 0.99
max = 0.9999
scale = auto

[sweep.train.min_lr_ratio]
distribution = uniform
min = 0.0
max = 0.3
scale = auto

[sweep.train.beta1]
distribution = uniform
min = 0.8
max = 0.99
scale = auto

[sweep.train.eps]
distribution = log_normal
min = 1e-6
max = 1e-4
scale = auto

[sweep.train.gae_lambda]
distribution = logit_normal
min = 0.5
max = 0.999
scale = auto

[sweep.train.vtrace_rho_clip]
distribution = uniform
min = 1.0
max = 3.0
scale = auto

[sweep.train.vtrace_c_clip]
distribution = uniform
min = 1.0
max = 2.5
scale = auto

[sweep.train.prio_alpha]
distribution = logit_normal
min = 0.0
max = 0.8
scale = auto

[sweep.train.prio_beta0]
distribution = logit_normal
min = 0.01
max = 0.8
scale = auto

[sweep.train.clip_coef]
distribution = uniform
min = 0.2
max = 1.0
scale = auto

[sweep.train.vf_coef]
# NOTE(review): max = 0.5 exceeds the "vf_coef must stay low (< 0.15)"
# constraint stated in the header comment — confirm the sweep should be
# allowed to explore above 0.15.
distribution = log_normal
min = 0.005
max = 0.5
scale = auto

[sweep.train.vf_clip_coef]
distribution = uniform
min = 0.1
max = 2.0
scale = auto

[sweep.train.max_grad_norm]
distribution = uniform
min = 0.5
max = 3.0
scale = auto

[sweep.train.replay_ratio]
distribution = uniform
min = 0.1
max = 1.0
scale = auto

[sweep.train.weight_decay]
distribution = log_normal
min = 0.001
max = 0.3
scale = auto

[sweep.train.minibatch_size]
distribution = uniform_pow2
min = 2048
max = 8192
scale = auto

[sweep.train.num_buffers]
# NOTE(review): num_buffers is declared under [vec] above, not [train] —
# verify the sweep framework resolves this path (or rename the section to
# sweep.vec.num_buffers).
distribution = uniform_pow2
min = 1
max = 4
scale = auto

[sweep.policy.hidden_size]
distribution = uniform_pow2
min = 128
max = 512
scale = auto

[sweep.policy.num_layers]
# NOTE(review): float max (5.0) for an integer-valued layer count, and
# uniform rather than uniform_pow2 — presumably the sweeper rounds samples;
# confirm.
distribution = uniform
min = 2
max = 5.0
scale = auto

28 changes: 28 additions & 0 deletions config/ocean/osrs_pvp.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# osrs_pvp training config.
# Untuned baseline: round defaults, in contrast to the swept anchors in
# osrs_inferno.ini. No [sweep] sections defined for this env yet.

[base]
env_name = osrs_pvp
score_metric = score

[env]
# NOTE(review): integer/boolean-looking values written as floats —
# presumably the env config loader parses floats; verify.
start_wave = 0.0
mask_in_obs = 1.0

[vec]
total_agents = 2048
num_buffers = 4
num_threads = 4

[policy]
hidden_size = 256
num_layers = 2

[train]
total_timesteps = 200000000
horizon = 32
learning_rate = 0.003
gamma = 0.998
ent_coef = 0.001
clip_coef = 0.6
vf_coef = 0.1
replay_ratio = 0.5
minibatch_size = 4096
weight_decay = 0.05
28 changes: 28 additions & 0 deletions config/ocean/osrs_zulrah.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# osrs_zulrah training config.
# Identical defaults to the osrs_pvp config except for env_name; untuned
# baseline (no [sweep] sections), unlike the swept osrs_inferno.ini.

[base]
env_name = osrs_zulrah
score_metric = score

[env]
# NOTE(review): start_wave looks PvP/inferno-specific — confirm the zulrah
# encounter actually reads it, or whether it is inert here.
start_wave = 0.0
mask_in_obs = 1.0

[vec]
total_agents = 2048
num_buffers = 4
num_threads = 4

[policy]
hidden_size = 256
num_layers = 2

[train]
total_timesteps = 200000000
horizon = 32
learning_rate = 0.003
gamma = 0.998
ent_coef = 0.001
clip_coef = 0.6
vf_coef = 0.1
replay_ratio = 0.5
minibatch_size = 4096
weight_decay = 0.05
43 changes: 43 additions & 0 deletions ocean/osrs/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# OSRS PvP C Environment Makefile
#
# standalone targets (no PufferLib dependency):
#   make          — headless benchmark binary
#   make visual   — headed raylib viewer with human input
#   make debug    — debug build with sanitizers
#
# PufferLib training uses setup.py build_osrs_pvp instead (see README).

# Delete a half-written target when its recipe fails, so a failed link
# never leaves a stale binary that looks up to date.
.DELETE_ON_ERROR:
# Disable legacy suffix rules — all rules here are explicit.
.SUFFIXES:

CC = clang
CFLAGS = -Wall -Wextra -O3 -ffast-math -flto -fPIC -std=c11
DEBUG_FLAGS = -Wall -Wextra -g -O0 -fPIC -std=c11 -DDEBUG
LDFLAGS = -lm

TARGET = osrs_pvp
DEMO_SRC = osrs_pvp.c
HEADERS = osrs_pvp.h

# Raylib (for visual target). download from https://github.com/raysan5/raylib/releases
RAYLIB_DIR = raylib-5.5_macos
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
# macOS: link the static archive plus required system frameworks.
RAYLIB_FLAGS = -I$(RAYLIB_DIR)/include $(RAYLIB_DIR)/lib/libraylib.a \
	-framework Cocoa -framework OpenGL -framework IOKit -framework CoreVideo
else
# Linux: dynamic raylib plus the usual GL/pthread/dl/rt dependencies.
RAYLIB_FLAGS = -I$(RAYLIB_DIR)/include -L$(RAYLIB_DIR)/lib -lraylib -lGL -lpthread -ldl -lrt
endif

.PHONY: all clean debug visual

all: $(TARGET)

# Headers are prerequisites so editing them triggers a rebuild (single-TU build).
$(TARGET): $(DEMO_SRC) $(HEADERS)
	$(CC) $(CFLAGS) -o $@ $(DEMO_SRC) $(LDFLAGS)

# NOTE: `visual` is phony but produces $(TARGET)_visual, so it relinks on
# every invocation — acceptable for a single-TU viewer build.
visual: $(DEMO_SRC) $(HEADERS) osrs_pvp_render.h osrs_pvp_gui.h
	$(CC) $(CFLAGS) -DOSRS_PVP_VISUAL $(RAYLIB_FLAGS) -o $(TARGET)_visual $(DEMO_SRC) $(LDFLAGS)

# NOTE(review): header comment advertises sanitizers, but DEBUG_FLAGS has no
# -fsanitize= flags — confirm whether -fsanitize=address,undefined belongs here.
debug: $(DEMO_SRC) $(HEADERS)
	$(CC) $(DEBUG_FLAGS) -o $(TARGET)_debug $(DEMO_SRC) $(LDFLAGS)

clean:
	$(RM) $(TARGET) $(TARGET)_debug $(TARGET)_visual *.o
52 changes: 52 additions & 0 deletions ocean/osrs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# OSRS PvP Environment

C implementation of Old School RuneScape NH PvP for reinforcement learning.
~1.1M env steps/sec standalone, ~235K training SPS on Metal.

## Build and train

```bash
python setup.py build_osrs_pvp --inplace --force
python train_pvp.py --no-wandb --total-timesteps 50000000

# zulrah (separate build, overwrites _C.so)
python setup.py build_osrs_zulrah --inplace --force
python train_zulrah.py --no-wandb --total-timesteps 500000000
```

Two bindings: `binding.c` (Metal vecenv.h) and `ocean_binding.c` (PufferLib env_binding.h).

## Data assets

Not in git. Exported from the OSRS game cache:

1. Download a modern cache from https://archive.openrs2.org/ ("flat file" export)
2. `cd pufferlib/ocean/osrs_pvp && ./scripts/export_all.sh /path/to/cache`

Pure Python, no deps.

## Spaces

**Obs:** 373 = 334 features + 39 action mask, normalized in C.

**Actions:** MultiDiscrete `[9, 13, 6, 2, 5, 2, 2]` — loadout, combat, prayer, food, potion, karambwan, veng.

**Timing:** tick N actions apply at tick N+1 (OSRS-accurate async).

## Opponents

28 scripted policies from trivial (`true_random`) to boss (`nightmare_nh` — onetick + 50% action reading). Curriculum mixes and PFSP supported.

## Encounters

Vtable interface (`osrs_encounter.h`). Current: NH PvP, Zulrah (81 obs, 6 heads, 3 forms, venom, clouds, collision).

## Files

Core env: `osrs_types/items + osrs_pvp_gear/combat/collision/pathfinding/movement/observations/actions/opponents/api.h`

Visual: `osrs_pvp_render/gui/anim/models/terrain/objects/effects/human_input.h`

Encounters: `encounters/encounter_nh_pvp.h`, `encounters/encounter_zulrah.h`

Data: `data/` (gitignored binaries + C model headers), `scripts/` (cache exporters)
Loading