Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions config/ocean/osrs_inferno.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# Metal osrs_inferno config.
# long episodes (300-2000+ ticks), 7 action heads (76 logits), 380+76 obs.
# vf_coef must stay low (< 0.15) — fused decoder amplifies value gradients
# into policy logits via MinGRU scan backward. replay_ratio < 1.0 to avoid
# stale target drift.

[base]
env_name = osrs_inferno
# NOTE(review): inferno reports episode_return, unlike osrs_pvp/osrs_zulrah
# which use score — confirm this asymmetry is intentional.
score_metric = episode_return

[env]
# NOTE(review): both values look integer/boolean-valued but are written as
# floats — presumably the config loader parses all env keys as float; verify.
start_wave = 0.0
mask_in_obs = 1.0

[vec]
total_agents = 2048
num_buffers = 4
num_threads = 4

[policy]
hidden_size = 256
num_layers = 2

[train]
# anchor from sweep trial #33 (score 74.6, wave 17+, prayer 60%)
# Values below are exact sweep outputs — do not round them by hand; re-run
# the sweep (see [sweep.*] sections) to retune.
total_timesteps = 400000000
horizon = 32
min_lr_ratio = 0.003872
learning_rate = 0.003069
beta1 = 0.95
eps = 0.000004
ent_coef = 0.0017
gamma = 0.998319
gae_lambda = 0.8
vtrace_rho_clip = 2.243133
vtrace_c_clip = 1.971016
# prio_alpha = 0 disables prioritization even though prio_beta0 is nonzero.
prio_alpha = 0.0
prio_beta0 = 0.275787
clip_coef = 0.611932
# kept low per the vf_coef < 0.15 constraint documented at the top of file.
vf_coef = 0.063963
vf_clip_coef = 0.404894
max_grad_norm = 0.997781
replay_ratio = 0.790328
minibatch_size = 4096
ns_iters = 5
weight_decay = 0.089232

# --- Sweep search space ---------------------------------------------------
# Each [sweep.<group>.<key>] section defines the prior for one hyperparameter.
# The [train] anchors above came out of this space (trial #33).
[sweep]
min_sps = 50000
max_suggestion_cost = 3600
metric = episode_return
metric_distribution = linear

[sweep.train.horizon]
distribution = uniform_pow2
min = 32
max = 256
scale = auto

[sweep.train.learning_rate]
distribution = log_normal
min = 0.0003
max = 0.01
scale = 0.5

[sweep.train.ent_coef]
distribution = log_normal
min = 0.001
max = 0.03
scale = auto

[sweep.train.gamma]
distribution = logit_normal
min = 0.99
max = 0.9999
scale = auto

[sweep.train.min_lr_ratio]
distribution = uniform
min = 0.0
max = 0.3
scale = auto

[sweep.train.beta1]
distribution = uniform
min = 0.8
max = 0.99
scale = auto

[sweep.train.eps]
distribution = log_normal
min = 1e-6
max = 1e-4
scale = auto

[sweep.train.gae_lambda]
distribution = logit_normal
min = 0.5
max = 0.999
scale = auto

[sweep.train.vtrace_rho_clip]
distribution = uniform
min = 1.0
max = 3.0
scale = auto

[sweep.train.vtrace_c_clip]
distribution = uniform
min = 1.0
max = 2.5
scale = auto

[sweep.train.prio_alpha]
distribution = logit_normal
min = 0.0
max = 0.8
scale = auto

[sweep.train.prio_beta0]
distribution = logit_normal
min = 0.01
max = 0.8
scale = auto

[sweep.train.clip_coef]
distribution = uniform
min = 0.2
max = 1.0
scale = auto

[sweep.train.vf_coef]
# NOTE(review): max = 0.5 exceeds the "vf_coef must stay low (< 0.15)"
# constraint stated in the header comment — confirm the sweep should be
# allowed to explore above 0.15.
distribution = log_normal
min = 0.005
max = 0.5
scale = auto

[sweep.train.vf_clip_coef]
distribution = uniform
min = 0.1
max = 2.0
scale = auto

[sweep.train.max_grad_norm]
distribution = uniform
min = 0.5
max = 3.0
scale = auto

[sweep.train.replay_ratio]
distribution = uniform
min = 0.1
max = 1.0
scale = auto

[sweep.train.weight_decay]
distribution = log_normal
min = 0.001
max = 0.3
scale = auto

[sweep.train.minibatch_size]
distribution = uniform_pow2
min = 2048
max = 8192
scale = auto

[sweep.train.num_buffers]
# NOTE(review): num_buffers is declared under [vec] above, not [train] —
# verify the sweep framework resolves this path (or rename the section to
# sweep.vec.num_buffers).
distribution = uniform_pow2
min = 1
max = 4
scale = auto

[sweep.policy.hidden_size]
distribution = uniform_pow2
min = 128
max = 512
scale = auto

[sweep.policy.num_layers]
# NOTE(review): float max (5.0) for an integer-valued layer count, and
# uniform rather than uniform_pow2 — presumably the sweeper rounds samples;
# confirm.
distribution = uniform
min = 2
max = 5.0
scale = auto

28 changes: 28 additions & 0 deletions config/ocean/osrs_pvp.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# osrs_pvp training config.
# Untuned baseline: round defaults, in contrast to the swept anchors in
# osrs_inferno.ini. No [sweep] sections defined for this env yet.

[base]
env_name = osrs_pvp
score_metric = score

[env]
# NOTE(review): integer/boolean-looking values written as floats —
# presumably the env config loader parses floats; verify.
start_wave = 0.0
mask_in_obs = 1.0

[vec]
total_agents = 2048
num_buffers = 4
num_threads = 4

[policy]
hidden_size = 256
num_layers = 2

[train]
total_timesteps = 200000000
horizon = 32
learning_rate = 0.003
gamma = 0.998
ent_coef = 0.001
clip_coef = 0.6
vf_coef = 0.1
replay_ratio = 0.5
minibatch_size = 4096
weight_decay = 0.05
28 changes: 28 additions & 0 deletions config/ocean/osrs_zulrah.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# osrs_zulrah training config.
# Identical defaults to the osrs_pvp config except for env_name; untuned
# baseline (no [sweep] sections), unlike the swept osrs_inferno.ini.

[base]
env_name = osrs_zulrah
score_metric = score

[env]
# NOTE(review): start_wave looks PvP/inferno-specific — confirm the zulrah
# encounter actually reads it, or whether it is inert here.
start_wave = 0.0
mask_in_obs = 1.0

[vec]
total_agents = 2048
num_buffers = 4
num_threads = 4

[policy]
hidden_size = 256
num_layers = 2

[train]
total_timesteps = 200000000
horizon = 32
learning_rate = 0.003
gamma = 0.998
ent_coef = 0.001
clip_coef = 0.6
vf_coef = 0.1
replay_ratio = 0.5
minibatch_size = 4096
weight_decay = 0.05
43 changes: 43 additions & 0 deletions ocean/osrs/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# OSRS PvP C Environment Makefile
#
# standalone targets (no PufferLib dependency):
#   make          — headless benchmark binary
#   make visual   — headed raylib viewer with human input
#   make debug    — debug build with sanitizers
#
# PufferLib training uses setup.py build_osrs_pvp instead (see README).

# Delete a half-written target when its recipe fails, so a failed link
# never leaves a stale binary that looks up to date.
.DELETE_ON_ERROR:
# Disable legacy suffix rules — all rules here are explicit.
.SUFFIXES:

CC = clang
CFLAGS = -Wall -Wextra -O3 -ffast-math -flto -fPIC -std=c11
DEBUG_FLAGS = -Wall -Wextra -g -O0 -fPIC -std=c11 -DDEBUG
LDFLAGS = -lm

TARGET = osrs_pvp
DEMO_SRC = osrs_pvp.c
HEADERS = osrs_pvp.h

# Raylib (for visual target). download from https://github.com/raysan5/raylib/releases
RAYLIB_DIR = raylib-5.5_macos
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
# macOS: link the static archive plus required system frameworks.
RAYLIB_FLAGS = -I$(RAYLIB_DIR)/include $(RAYLIB_DIR)/lib/libraylib.a \
	-framework Cocoa -framework OpenGL -framework IOKit -framework CoreVideo
else
# Linux: dynamic raylib plus the usual GL/pthread/dl/rt dependencies.
RAYLIB_FLAGS = -I$(RAYLIB_DIR)/include -L$(RAYLIB_DIR)/lib -lraylib -lGL -lpthread -ldl -lrt
endif

.PHONY: all clean debug visual

all: $(TARGET)

# Headers are prerequisites so editing them triggers a rebuild (single-TU build).
$(TARGET): $(DEMO_SRC) $(HEADERS)
	$(CC) $(CFLAGS) -o $@ $(DEMO_SRC) $(LDFLAGS)

# NOTE: `visual` is phony but produces $(TARGET)_visual, so it relinks on
# every invocation — acceptable for a single-TU viewer build.
visual: $(DEMO_SRC) $(HEADERS) osrs_pvp_render.h osrs_pvp_gui.h
	$(CC) $(CFLAGS) -DOSRS_PVP_VISUAL $(RAYLIB_FLAGS) -o $(TARGET)_visual $(DEMO_SRC) $(LDFLAGS)

# NOTE(review): header comment advertises sanitizers, but DEBUG_FLAGS has no
# -fsanitize= flags — confirm whether -fsanitize=address,undefined belongs here.
debug: $(DEMO_SRC) $(HEADERS)
	$(CC) $(DEBUG_FLAGS) -o $(TARGET)_debug $(DEMO_SRC) $(LDFLAGS)

clean:
	$(RM) $(TARGET) $(TARGET)_debug $(TARGET)_visual *.o
52 changes: 52 additions & 0 deletions ocean/osrs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# OSRS PvP Environment

C implementation of Old School RuneScape NH PvP for reinforcement learning.
~1.1M env steps/sec standalone, ~235K training SPS on Metal.

## Build and train

```bash
python setup.py build_osrs_pvp --inplace --force
python train_pvp.py --no-wandb --total-timesteps 50000000

# zulrah (separate build, overwrites _C.so)
python setup.py build_osrs_zulrah --inplace --force
python train_zulrah.py --no-wandb --total-timesteps 500000000
```

Two bindings: `binding.c` (Metal vecenv.h) and `ocean_binding.c` (PufferLib env_binding.h).

## Data assets

Not in git. Exported from the OSRS game cache:

1. Download a modern cache from https://archive.openrs2.org/ ("flat file" export)
2. `cd pufferlib/ocean/osrs_pvp && ./scripts/export_all.sh /path/to/cache`

Pure Python, no deps.

## Spaces

**Obs:** 373 = 334 features + 39 action mask, normalized in C.

**Actions:** MultiDiscrete `[9, 13, 6, 2, 5, 2, 2]` — loadout, combat, prayer, food, potion, karambwan, veng.

**Timing:** tick N actions apply at tick N+1 (OSRS-accurate async).

## Opponents

28 scripted policies from trivial (`true_random`) to boss (`nightmare_nh` — onetick + 50% action reading). Curriculum mixes and PFSP supported.

## Encounters

Vtable interface (`osrs_encounter.h`). Current: NH PvP, Zulrah (81 obs, 6 heads, 3 forms, venom, clouds, collision).

## Files

Core env: `osrs_types/items + osrs_pvp_gear/combat/collision/pathfinding/movement/observations/actions/opponents/api.h`

Visual: `osrs_pvp_render/gui/anim/models/terrain/objects/effects/human_input.h`

Encounters: `encounters/encounter_nh_pvp.h`, `encounters/encounter_zulrah.h`

Data: `data/` (gitignored binaries + C model headers), `scripts/` (cache exporters)
Loading