diff --git a/examples/pufferl_clifford_curriculum.py b/examples/pufferl_clifford_curriculum.py new file mode 100644 index 0000000000..38f593e8ba --- /dev/null +++ b/examples/pufferl_clifford_curriculum.py @@ -0,0 +1,495 @@ +#!/usr/bin/env python3 + +import argparse +import math +import os +import sys +from collections import defaultdict + +import numpy as np +import torch + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +from pufferlib.ocean.clifford import Clifford +from pufferlib.pytorch import layer_init + +N_QUBITS = 4 +START_DIFFICULTY = 1.0 +MAX_DIFFICULTY = 1000.0 +SUCCESS_THRESHOLD = 0.90 +DIFFICULTY_STEP = 0.25 +SINGLE_QUBIT_COST = 0.01 +GOAL_BONUS = 0.0 +REWARD_MODE = "hamming_left" +HAMMING_LEFT_SCALE = 0.5 + +NUM_ENVS = 2048 +TOTAL_TIMESTEPS = 50_000_000 +LEARNING_RATE = 2.5e-4 +HIDDEN_SIZE = 256 +HIDDEN_LAYERS = 2 +N_STEPS = 128 +MINIBATCH_SIZE = 8192 +UPDATE_EPOCHS = 4 +GAMMA = 0.99 +GAE_LAMBDA = 0.95 +ENT_COEF = 0.0 +CLIP_COEF = 0.15 +VF_CLIP_COEF = 0.2 +TARGET_KL = 0.0 + +EVAL_FREQ_STEPS = 98_304 +PROGRESSION_EVAL_EPISODES = 50 +GREEDY_EVAL_DIFFICULTY_CUTOFF = 20.0 +MIN_ADVANCE_EVALS = 1 +EMA_SR_ALPHA = 0.9 +MAX_STEPS_SLACK = 64 +MIN_MAX_STEPS = 32 +FORCE_MAX_STEPS_AFTER_DIFFICULTY = 12.0 +FORCE_MAX_STEPS_VALUE = 1000 + +TRAIN_CONFIG_DEFAULTS = { + "env": "puffer_clifford", + "torch_deterministic": True, + "cpu_offload": False, + "optimizer": "muon", + "precision": "float32", + "anneal_lr": True, + "min_lr_ratio": 0.0, + "vf_coef": 2.0, + "max_grad_norm": 1.5, + "adam_beta1": 0.95, + "adam_beta2": 0.999, + "adam_eps": 1e-12, + "data_dir": "experiments", + "checkpoint_interval": 200, + "max_minibatch_size": 32768, + "compile": False, + "use_rnn": False, + "vtrace_rho_clip": 1.0, + "vtrace_c_clip": 1.0, + "prio_alpha": 0.8, + "prio_beta0": 0.2, +} + +PANEL_STAT_KEYS = { + "curriculum/difficulty", + "curriculum/rollout_success_rate", + 
"curriculum/decision_success_rate", + "curriculum/ema_success_rate", + "curriculum/mean_reward", + "curriculum/mean_cz", + "curriculum/evals_since_advance", + "curriculum/consecutive_above_threshold", + "curriculum/ent_coef", + "perf", + "score", + "episode_return", + "episode_length", + "mean_cz", + "success_rate", +} + + +def default_device(): + return "cuda" if torch.cuda.is_available() else "cpu" + + +class Policy(torch.nn.Module): + def __init__(self, env, hidden_size=HIDDEN_SIZE, hidden_layers=HIDDEN_LAYERS): + super().__init__() + obs_size = int(np.prod(env.single_observation_space.shape)) + num_actions = int(env.single_action_space.n) + hidden_layers = max(int(hidden_layers), 1) + + backbone = [torch.nn.LayerNorm(obs_size)] + in_features = obs_size + for _ in range(hidden_layers): + backbone.append(layer_init(torch.nn.Linear(in_features, int(hidden_size)))) + backbone.append(torch.nn.GELU()) + in_features = int(hidden_size) + self.backbone = torch.nn.Sequential(*backbone) + self.action_head = torch.nn.Linear(in_features, num_actions) + self.value_head = torch.nn.Linear(in_features, 1) + layer_init(self.action_head, std=0.01) + layer_init(self.value_head, std=1.0) + + def forward_eval(self, observations, state=None): + hidden = self.backbone(observations.float().view(observations.shape[0], -1)) + logits = self.action_head(hidden) + values = self.value_head(hidden) + return logits, values + + def forward(self, observations, state=None): + return self.forward_eval(observations, state) + +def make_train_config(args): + return { + **TRAIN_CONFIG_DEFAULTS, + "total_timesteps": int(args.total_timesteps), + "learning_rate": float(args.learning_rate), + "batch_size": int(args.num_envs) * N_STEPS, + "bptt_horizon": N_STEPS, + "minibatch_size": max(MINIBATCH_SIZE, N_STEPS), + "update_epochs": UPDATE_EPOCHS, + "device": args.device, + "seed": int(args.seed), + "gamma": GAMMA, + "gae_lambda": GAE_LAMBDA, + "ent_coef": float(args.ent_coef), + "clip_coef": CLIP_COEF, + 
"vf_clip_coef": VF_CLIP_COEF, + "target_kl": TARGET_KL, + } + + +def make_env(num_envs, difficulty, max_steps, seed): + return Clifford( + num_envs=int(num_envs), + n_qubits=N_QUBITS, + difficulty=float(difficulty), + max_steps=int(max_steps), + single_qubit_cost=SINGLE_QUBIT_COST, + goal_bonus=GOAL_BONUS, + reward_mode=REWARD_MODE, + hamming_left_scale=HAMMING_LEFT_SCALE, + use_reset_pool=True, + seed=int(seed), + ) + + +def run_greedy_eval(policy, env, device, episodes, seed): + completed = 0 + successes = 0 + rewards = [] + completed_cz_counts = [] + + obs, _ = env.reset(seed=seed) + episode_rewards = np.zeros(env.num_agents, dtype=np.float32) + episode_cz_counts = np.zeros(env.num_agents, dtype=np.int32) + + policy_was_training = policy.training + policy.eval() + + with torch.no_grad(): + while completed < episodes: + obs_t = torch.as_tensor(obs, device=device) + logits, _values = policy.forward_eval(obs_t) + actions = torch.argmax(logits, dim=-1).cpu().numpy().astype(np.int32, copy=False) + obs, step_rewards, terminals, truncations, _info = env.step(actions) + episode_rewards += step_rewards + for idx, action in enumerate(actions): + if env._actions[int(action)][0] == "cz": + episode_cz_counts[idx] += 1 + + done_mask = np.logical_or(terminals, truncations) + done_indices = np.flatnonzero(done_mask) + for idx in done_indices: + completed += 1 + rewards.append(float(episode_rewards[idx])) + completed_cz_counts.append(int(episode_cz_counts[idx])) + if terminals[idx]: + successes += 1 + episode_rewards[idx] = 0.0 + episode_cz_counts[idx] = 0 + if completed >= episodes: + break + + if policy_was_training: + policy.train() + + return { + "success_rate": successes / max(completed, 1), + "mean_reward": float(np.mean(rewards)) if rewards else 0.0, + "mean_cz": float(np.mean(completed_cz_counts)) if completed_cz_counts else 0.0, + } + + +def _metric_values(rollout_stats, key): + values = rollout_stats.get(key, []) + if isinstance(values, np.ndarray): + values = 
values.tolist() + if isinstance(values, (list, tuple)): + return [float(v) for v in values] + if values is None: + return [] + return [float(values)] + + +def merge_rollout_stats(rollout_stats, extra_metrics): + merged = defaultdict(list) + for source in (rollout_stats, extra_metrics): + if not source: + continue + for key, value in source.items(): + if isinstance(value, np.ndarray): + value = value.tolist() + if isinstance(value, (list, tuple)): + merged[key].extend(value) + else: + merged[key].append(value) + return merged + + +def rollout_progression_metrics(rollout_stats): + if not rollout_stats: + return None + episode_count = sum(_metric_values(rollout_stats, "episode_count")) + if episode_count <= 0.0: + return None + + success_count = sum(_metric_values(rollout_stats, "success_count")) + reward_sum = sum(_metric_values(rollout_stats, "episode_return_sum")) + episode_cz_sum = sum(_metric_values(rollout_stats, "episode_cz_sum")) + return { + "success_rate": success_count / episode_count, + "mean_reward": reward_sum / episode_count, + "mean_cz": episode_cz_sum / episode_count, + } + + +def compute_max_steps(difficulty): + max_steps = int(math.ceil(max(float(difficulty), 0.0) - 1e-12)) + MAX_STEPS_SLACK + if ( + FORCE_MAX_STEPS_AFTER_DIFFICULTY is not None + and FORCE_MAX_STEPS_VALUE > 0 + and difficulty >= FORCE_MAX_STEPS_AFTER_DIFFICULTY + ): + max_steps = FORCE_MAX_STEPS_VALUE + return max(max_steps, MIN_MAX_STEPS, 1) + + +def update_curriculum_stats(trainer, values): + for key, value in values.items(): + trainer.stats[key] = value + trainer.last_stats[key] = value + + +def curriculum_stats( + difficulty, + rollout_success_rate, + decision_success_rate, + ema_success_rate, + mean_reward, + mean_cz, + evals_since_advance, + consecutive_above_threshold, + ent_coef, +): + return { + "curriculum/difficulty": float(difficulty), + "curriculum/mean_reward": float(mean_reward), + "curriculum/decision_success_rate": float(decision_success_rate), + 
"curriculum/ema_success_rate": float(ema_success_rate), + "curriculum/rollout_success_rate": float(rollout_success_rate), + "curriculum/mean_cz": float(mean_cz), + "curriculum/evals_since_advance": float(evals_since_advance), + "curriculum/consecutive_above_threshold": float(consecutive_above_threshold), + "curriculum/ent_coef": float(ent_coef), + } + + +def prune_panel_stats(stats): + if not stats: + return + for key in list(stats.keys()): + if key not in PANEL_STAT_KEYS: + del stats[key] + + +def install_panel_filter(trainer): + original_print_dashboard = trainer.print_dashboard + + def filtered_print_dashboard(*args, **kwargs): + if trainer.stats: + trainer.raw_stats = dict(trainer.stats) + elif trainer.last_stats: + trainer.raw_stats = dict(trainer.last_stats) + prune_panel_stats(trainer.stats) + prune_panel_stats(trainer.last_stats) + return original_print_dashboard(*args, **kwargs) + + trainer.print_dashboard = filtered_print_dashboard + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Minimal Ocean Clifford trainer focused on the current working 4-qubit regime" + ) + parser.add_argument("--device", default=default_device()) + parser.add_argument("--num-envs", type=int, default=NUM_ENVS) + parser.add_argument("--total-timesteps", type=int, default=TOTAL_TIMESTEPS) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--learning-rate", type=float, default=LEARNING_RATE) + parser.add_argument("--ent-coef", type=float, default=ENT_COEF) + parser.add_argument("--difficulty-step", type=float, default=DIFFICULTY_STEP) + return parser.parse_args() + + +def validate_args(args): + if args.num_envs <= 0: + raise ValueError("num_envs must be positive") + if args.total_timesteps < int(args.num_envs) * N_STEPS: + raise ValueError(f"total_timesteps must be at least {int(args.num_envs) * N_STEPS}") + if args.learning_rate <= 0.0: + raise ValueError("learning_rate must be positive") + if args.ent_coef < 0.0: + raise 
ValueError("ent_coef must be non-negative") + if args.difficulty_step <= 0.0: + raise ValueError("difficulty_step must be positive") + + +def main(): + args = parse_args() + validate_args(args) + from pufferlib import pufferl + + current_difficulty = START_DIFFICULTY + current_max_steps = compute_max_steps(current_difficulty) + + train_env = make_env(args.num_envs, current_difficulty, current_max_steps, args.seed) + eval_env = make_env( + min(PROGRESSION_EVAL_EPISODES, int(args.num_envs)), + current_difficulty, + current_max_steps, + args.seed + 10_000, + ) + + policy = Policy(train_env).to(args.device) + trainer = pufferl.PuffeRL(make_train_config(args), train_env, policy) + install_panel_filter(trainer) + + last_progress_step = 0 + ema_success_rate = 0.0 + ema_initialized = False + consecutive_above_threshold = 0 + evals_since_advance = 0 + last_rollout_success_rate = 0.0 + last_decision_success_rate = 0.0 + last_mean_reward = 0.0 + last_mean_cz = 0.0 + + update_curriculum_stats( + trainer, + curriculum_stats( + current_difficulty, + last_rollout_success_rate, + last_decision_success_rate, + ema_success_rate, + last_mean_reward, + last_mean_cz, + evals_since_advance, + consecutive_above_threshold, + trainer.config["ent_coef"], + ), + ) + trainer.print_dashboard(clear=True) + + try: + while trainer.epoch < trainer.total_epochs: + update_curriculum_stats( + trainer, + curriculum_stats( + current_difficulty, + last_rollout_success_rate, + last_decision_success_rate, + ema_success_rate, + last_mean_reward, + last_mean_cz, + evals_since_advance, + consecutive_above_threshold, + trainer.config["ent_coef"], + ), + ) + trainer.evaluate() + trainer.train() + + if trainer.global_step - last_progress_step < EVAL_FREQ_STEPS: + continue + last_progress_step = trainer.global_step + + rollout_stats = merge_rollout_stats(getattr(trainer, "raw_stats", None), train_env.flush_logs()) + rollout_metrics = rollout_progression_metrics(rollout_stats) + if rollout_metrics is None: + 
continue + + decision_metrics = rollout_metrics + if current_difficulty < GREEDY_EVAL_DIFFICULTY_CUTOFF: + eval_env.set_difficulty(current_difficulty) + eval_env.set_max_steps(current_max_steps) + decision_metrics = run_greedy_eval( + policy=policy, + env=eval_env, + device=args.device, + episodes=PROGRESSION_EVAL_EPISODES, + seed=args.seed + trainer.epoch, + ) + + raw_success_rate = float(rollout_metrics["success_rate"]) + if not ema_initialized: + ema_success_rate = raw_success_rate + ema_initialized = True + else: + ema_success_rate = ( + float(EMA_SR_ALPHA) * ema_success_rate + + (1.0 - float(EMA_SR_ALPHA)) * raw_success_rate + ) + + decision_success_rate = float(decision_metrics["success_rate"]) + advancement_success_rate = ( + decision_success_rate + if current_difficulty < GREEDY_EVAL_DIFFICULTY_CUTOFF + else ema_success_rate + ) + + if advancement_success_rate >= SUCCESS_THRESHOLD: + consecutive_above_threshold += 1 + else: + consecutive_above_threshold = 0 + + if ( + consecutive_above_threshold >= MIN_ADVANCE_EVALS + and current_difficulty < MAX_DIFFICULTY + ): + current_difficulty = min(MAX_DIFFICULTY, current_difficulty + args.difficulty_step) + current_max_steps = compute_max_steps(current_difficulty) + train_env.set_difficulty(current_difficulty) + train_env.set_max_steps(current_max_steps) + eval_env.set_difficulty(current_difficulty) + eval_env.set_max_steps(current_max_steps) + consecutive_above_threshold = 0 + evals_since_advance = 0 + else: + evals_since_advance += 1 + + last_rollout_success_rate = raw_success_rate + last_decision_success_rate = decision_success_rate + last_mean_reward = float(rollout_metrics["mean_reward"]) + last_mean_cz = float(rollout_metrics["mean_cz"]) + + update_curriculum_stats( + trainer, + curriculum_stats( + current_difficulty, + last_rollout_success_rate, + last_decision_success_rate, + ema_success_rate, + last_mean_reward, + last_mean_cz, + evals_since_advance, + consecutive_above_threshold, + 
trainer.config["ent_coef"], + ), + ) + + trainer.print_dashboard() + finally: + trainer.close() + eval_env.close() + + +if __name__ == "__main__": + main() diff --git a/pufferlib/ocean/clifford/__init__.py b/pufferlib/ocean/clifford/__init__.py new file mode 100644 index 0000000000..52850904ee --- /dev/null +++ b/pufferlib/ocean/clifford/__init__.py @@ -0,0 +1,11 @@ +from .reference import ReferenceCliffordEnv + +try: + from .clifford import Clifford +except ImportError: + Clifford = None + +__all__ = [ + "Clifford", + "ReferenceCliffordEnv", +] diff --git a/pufferlib/ocean/clifford/binding.c b/pufferlib/ocean/clifford/binding.c new file mode 100644 index 0000000000..8f7b6718ba --- /dev/null +++ b/pufferlib/ocean/clifford/binding.c @@ -0,0 +1,411 @@ +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION + +#include +#include +#include + +#include "clifford.h" + +static int require_contiguous_array(PyObject* obj, int typenum, int ndim, const char* name, PyArrayObject** out) { + if (!PyObject_TypeCheck(obj, &PyArray_Type)) { + PyErr_Format(PyExc_TypeError, "%s must be a NumPy array", name); + return 0; + } + PyArrayObject* arr = (PyArrayObject*)obj; + if (!PyArray_ISCONTIGUOUS(arr)) { + PyErr_Format(PyExc_ValueError, "%s must be contiguous", name); + return 0; + } + if (PyArray_TYPE(arr) != typenum) { + PyErr_Format(PyExc_TypeError, "%s has unexpected dtype", name); + return 0; + } + if (ndim >= 0 && PyArray_NDIM(arr) != ndim) { + PyErr_Format(PyExc_ValueError, "%s must have ndim=%d", name, ndim); + return 0; + } + *out = arr; + return 1; +} + +static PyObject* require_kwarg(PyObject* kwargs, const char* name) { + PyObject* obj = PyDict_GetItemString(kwargs, name); + if (obj == NULL) { + PyErr_Format(PyExc_TypeError, "missing required keyword argument '%s'", name); + } + return obj; +} + +static CliffordVecEnv* unpack_handle(PyObject* args) { + PyObject* handle_obj = PyTuple_GetItem(args, 0); + if (!PyLong_Check(handle_obj)) { + PyErr_SetString(PyExc_TypeError, "handle 
must be an integer"); + return NULL; + } + CliffordVecEnv* vec = (CliffordVecEnv*)PyLong_AsVoidPtr(handle_obj); + if (vec == NULL) { + PyErr_SetString(PyExc_ValueError, "invalid native Clifford handle"); + return NULL; + } + return vec; +} + +static PyObject* py_vec_init(PyObject* self, PyObject* args, PyObject* kwargs) { + (void)self; + if (PyTuple_Size(args) != 7) { + PyErr_SetString(PyExc_TypeError, "vec_init requires 7 positional arguments"); + return NULL; + } + + PyArrayObject *obs_arr = NULL, *act_arr = NULL, *rew_arr = NULL, *term_arr = NULL, *trunc_arr = NULL; + if (!require_contiguous_array(PyTuple_GetItem(args, 0), NPY_UINT8, 2, "observations", &obs_arr)) return NULL; + if (!require_contiguous_array(PyTuple_GetItem(args, 1), NPY_INT32, 1, "actions", &act_arr)) return NULL; + if (!require_contiguous_array(PyTuple_GetItem(args, 2), NPY_FLOAT32, 1, "rewards", &rew_arr)) return NULL; + if (!require_contiguous_array(PyTuple_GetItem(args, 3), NPY_BOOL, 1, "terminals", &term_arr)) return NULL; + if (!require_contiguous_array(PyTuple_GetItem(args, 4), NPY_BOOL, 1, "truncations", &trunc_arr)) return NULL; + + int num_envs = (int)PyLong_AsLong(PyTuple_GetItem(args, 5)); + int seed = (int)PyLong_AsLong(PyTuple_GetItem(args, 6)); + if (PyErr_Occurred()) { + return NULL; + } + if (num_envs <= 0) { + PyErr_SetString(PyExc_ValueError, "num_envs must be positive"); + return NULL; + } + + PyArrayObject *gate_kinds = NULL, *action_q0 = NULL, *action_q1 = NULL; + if (!require_contiguous_array(require_kwarg(kwargs, "gate_kinds"), NPY_INT32, 1, "gate_kinds", &gate_kinds)) return NULL; + if (!require_contiguous_array(require_kwarg(kwargs, "action_q0"), NPY_INT32, 1, "action_q0", &action_q0)) return NULL; + if (!require_contiguous_array(require_kwarg(kwargs, "action_q1"), NPY_INT32, 1, "action_q1", &action_q1)) return NULL; + + int num_actions = (int)PyArray_DIM(gate_kinds, 0); + if (num_actions <= 0 || PyArray_DIM(action_q0, 0) != num_actions || PyArray_DIM(action_q1, 0) != 
num_actions) { + PyErr_SetString(PyExc_ValueError, "action arrays must have the same positive length"); + return NULL; + } + + int n_qubits = (int)PyLong_AsLong(require_kwarg(kwargs, "n_qubits")); + double difficulty = PyFloat_AsDouble(require_kwarg(kwargs, "difficulty")); + int max_steps = (int)PyLong_AsLong(require_kwarg(kwargs, "max_steps")); + float single_qubit_cost = (float)PyFloat_AsDouble(require_kwarg(kwargs, "single_qubit_cost")); + float goal_bonus = (float)PyFloat_AsDouble(require_kwarg(kwargs, "goal_bonus")); + int reward_mode = (int)PyLong_AsLong(require_kwarg(kwargs, "reward_mode")); + float hamming_left_scale = (float)PyFloat_AsDouble(require_kwarg(kwargs, "hamming_left_scale")); + PyObject* reset_pool_enabled_obj = PyDict_GetItemString(kwargs, "use_reset_pool"); + int reset_pool_enabled = reset_pool_enabled_obj == NULL ? 1 : (int)PyObject_IsTrue(reset_pool_enabled_obj); + if (PyErr_Occurred()) { + return NULL; + } + if (n_qubits <= 0 || 2 * n_qubits > 64) { + PyErr_SetString(PyExc_ValueError, "native Clifford env requires 0 < n_qubits and 2*n_qubits <= 64"); + return NULL; + } + if (max_steps <= 0) { + PyErr_SetString(PyExc_ValueError, "max_steps must be positive"); + return NULL; + } + if (reward_mode != REWARD_GATE_COST && reward_mode != REWARD_HAMMING_LEFT) { + PyErr_SetString( + PyExc_ValueError, + "reward_mode must be gate_cost or hamming_left" + ); + return NULL; + } + if (hamming_left_scale < 0.0f) { + PyErr_SetString(PyExc_ValueError, "hamming_left_scale must be non-negative"); + return NULL; + } + + const int dim = 2 * n_qubits; + const int obs_size = dim * dim; + if (PyArray_DIM(obs_arr, 0) != num_envs || PyArray_DIM(obs_arr, 1) != obs_size) { + PyErr_SetString(PyExc_ValueError, "observations must have shape (num_envs, (2*n_qubits)^2)"); + return NULL; + } + if (PyArray_DIM(act_arr, 0) != num_envs || PyArray_DIM(rew_arr, 0) != num_envs || + PyArray_DIM(term_arr, 0) != num_envs || PyArray_DIM(trunc_arr, 0) != num_envs) { + 
PyErr_SetString(PyExc_ValueError, "action and buffer arrays must have length num_envs"); + return NULL; + } + + CliffordVecEnv* vec = (CliffordVecEnv*)calloc(1, sizeof(CliffordVecEnv)); + if (vec == NULL) { + return PyErr_NoMemory(); + } + vec->num_envs = num_envs; + vec->n_qubits = n_qubits; + vec->dim = dim; + vec->obs_size = obs_size; + set_difficulty_level(vec, difficulty); + vec->max_steps = max_steps; + vec->num_actions = num_actions; + vec->reset_pool_enabled = reset_pool_enabled; + vec->single_qubit_cost = single_qubit_cost; + vec->goal_bonus = goal_bonus; + vec->reward_mode = reward_mode; + vec->hamming_left_scale = hamming_left_scale; + init_observation_tables(); + + vec->actions = (CliffordAction*)calloc((size_t)num_actions, sizeof(CliffordAction)); + vec->envs = (CliffordEnv*)calloc((size_t)num_envs, sizeof(CliffordEnv)); + if (vec->actions == NULL || vec->envs == NULL) { + free(vec->actions); + free(vec->envs); + free(vec); + return PyErr_NoMemory(); + } + + int* gate_kind_ptr = (int*)PyArray_DATA(gate_kinds); + int* q0_ptr = (int*)PyArray_DATA(action_q0); + int* q1_ptr = (int*)PyArray_DATA(action_q1); + for (int i = 0; i < num_actions; ++i) { + vec->actions[i].gate_kind = gate_kind_ptr[i]; + vec->actions[i].q0 = q0_ptr[i]; + vec->actions[i].q1 = q1_ptr[i]; + } + for (int col = 0; col < vec->dim; ++col) { + vec->identity_cols[col] = 1ULL << col; + } + rebuild_reset_pool(vec, (uint64_t)(unsigned int)seed); + + unsigned char* obs_ptr = (unsigned char*)PyArray_DATA(obs_arr); + int* act_ptr = (int*)PyArray_DATA(act_arr); + float* rew_ptr = (float*)PyArray_DATA(rew_arr); + unsigned char* term_ptr = (unsigned char*)PyArray_DATA(term_arr); + unsigned char* trunc_ptr = (unsigned char*)PyArray_DATA(trunc_arr); + uint64_t seed_state = (uint64_t)(unsigned int)seed; + for (int env_idx = 0; env_idx < num_envs; ++env_idx) { + CliffordEnv* env = &vec->envs[env_idx]; + env->cols = (uint64_t*)calloc((size_t)vec->dim, sizeof(uint64_t)); + if (env->cols == NULL) { + for 
(int j = 0; j < env_idx; ++j) { + free(vec->envs[j].cols); + } + free(vec->reset_pool_cols); + free(vec->actions); + free(vec->envs); + free(vec); + return PyErr_NoMemory(); + } + env->observations = obs_ptr + (env_idx * vec->obs_size); + env->actions = act_ptr + env_idx; + env->rewards = rew_ptr + env_idx; + env->terminals = term_ptr + env_idx; + env->truncations = trunc_ptr + env_idx; + rng_seed(&env->rng, splitmix64_next(&seed_state) ^ (uint64_t)(env_idx + 1)); + reset_single(vec, env); + } + return PyLong_FromVoidPtr(vec); +} + +static PyObject* py_vec_reset(PyObject* self, PyObject* args) { + (void)self; + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "vec_reset requires handle and seed"); + return NULL; + } + CliffordVecEnv* vec = unpack_handle(args); + if (vec == NULL) { + return NULL; + } + int seed = (int)PyLong_AsLong(PyTuple_GetItem(args, 1)); + if (PyErr_Occurred()) { + return NULL; + } + if (seed >= 0) { + rebuild_reset_pool(vec, (uint64_t)(unsigned int)seed); + } + uint64_t seed_state = (uint64_t)(unsigned int)seed; + for (int env_idx = 0; env_idx < vec->num_envs; ++env_idx) { + if (seed >= 0) { + rng_seed(&vec->envs[env_idx].rng, splitmix64_next(&seed_state) ^ (uint64_t)(env_idx + 1)); + } + vec->envs[env_idx].terminals[0] = 0; + vec->envs[env_idx].truncations[0] = 0; + vec->envs[env_idx].rewards[0] = 0.0f; + reset_single(vec, &vec->envs[env_idx]); + } + Py_RETURN_NONE; +} + +static PyObject* py_vec_step(PyObject* self, PyObject* args) { + (void)self; + if (PyTuple_Size(args) != 1) { + PyErr_SetString(PyExc_TypeError, "vec_step requires handle"); + return NULL; + } + CliffordVecEnv* vec = unpack_handle(args); + if (vec == NULL) { + return NULL; + } + for (int env_idx = 0; env_idx < vec->num_envs; ++env_idx) { + CliffordEnv* env = &vec->envs[env_idx]; + env->terminals[0] = 0; + env->truncations[0] = 0; + env->rewards[0] = 0.0f; + step_single(vec, env); + } + Py_RETURN_NONE; +} + +static PyObject* py_vec_close(PyObject* self, 
PyObject* args) { + (void)self; + if (PyTuple_Size(args) != 1) { + PyErr_SetString(PyExc_TypeError, "vec_close requires handle"); + return NULL; + } + CliffordVecEnv* vec = unpack_handle(args); + if (vec == NULL) { + return NULL; + } + for (int env_idx = 0; env_idx < vec->num_envs; ++env_idx) { + free(vec->envs[env_idx].cols); + } + free(vec->reset_pool_cols); + free(vec->actions); + free(vec->envs); + free(vec); + Py_RETURN_NONE; +} + +static PyObject* py_vec_set_difficulty(PyObject* self, PyObject* args) { + (void)self; + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "vec_set_difficulty requires handle and difficulty"); + return NULL; + } + CliffordVecEnv* vec = unpack_handle(args); + if (vec == NULL) { + return NULL; + } + double difficulty = PyFloat_AsDouble(PyTuple_GetItem(args, 1)); + if (PyErr_Occurred()) { + return NULL; + } + set_difficulty_level(vec, difficulty); + rebuild_reset_pool(vec, rng_next_u64(&vec->pool_rng)); + Py_RETURN_NONE; +} + +static PyObject* py_vec_set_max_steps(PyObject* self, PyObject* args) { + (void)self; + if (PyTuple_Size(args) != 2) { + PyErr_SetString(PyExc_TypeError, "vec_set_max_steps requires handle and max_steps"); + return NULL; + } + CliffordVecEnv* vec = unpack_handle(args); + if (vec == NULL) { + return NULL; + } + vec->max_steps = (int)PyLong_AsLong(PyTuple_GetItem(args, 1)); + if (PyErr_Occurred()) { + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject* py_vec_set_matrix(PyObject* self, PyObject* args) { + (void)self; + if (PyTuple_Size(args) != 3) { + PyErr_SetString(PyExc_TypeError, "vec_set_matrix requires handle, env_index, matrix"); + return NULL; + } + CliffordVecEnv* vec = unpack_handle(args); + if (vec == NULL) { + return NULL; + } + int env_index = (int)PyLong_AsLong(PyTuple_GetItem(args, 1)); + if (PyErr_Occurred()) { + return NULL; + } + if (env_index < 0 || env_index >= vec->num_envs) { + PyErr_SetString(PyExc_IndexError, "env_index out of range"); + return NULL; + } + 
PyArrayObject* matrix = NULL; + if (!require_contiguous_array(PyTuple_GetItem(args, 2), NPY_UINT8, 2, "matrix", &matrix)) { + return NULL; + } + if (!set_matrix_from_dense( + vec, + &vec->envs[env_index], + (unsigned char*)PyArray_DATA(matrix), + (int)PyArray_DIM(matrix, 0), + (int)PyArray_DIM(matrix, 1))) { + PyErr_SetString(PyExc_ValueError, "matrix shape does not match native Clifford env"); + return NULL; + } + vec->envs[env_index].rewards[0] = 0.0f; + vec->envs[env_index].terminals[0] = 0; + vec->envs[env_index].truncations[0] = 0; + Py_RETURN_NONE; +} + +static PyObject* py_vec_log(PyObject* self, PyObject* args) { + (void)self; + if (PyTuple_Size(args) != 1) { + PyErr_SetString(PyExc_TypeError, "vec_log requires handle"); + return NULL; + } + CliffordVecEnv* vec = unpack_handle(args); + if (vec == NULL) { + return NULL; + } + PyObject* out = PyDict_New(); + if (vec->log.n <= 0.0f) { + return out; + } + const float inv_n = 1.0f / vec->log.n; + PyDict_SetItemString(out, "perf", PyFloat_FromDouble(vec->log.perf * inv_n)); + PyDict_SetItemString(out, "score", PyFloat_FromDouble(vec->log.score * inv_n)); + PyDict_SetItemString(out, "episode_return", PyFloat_FromDouble(vec->log.episode_return * inv_n)); + PyDict_SetItemString(out, "episode_length", PyFloat_FromDouble(vec->log.episode_length * inv_n)); + PyDict_SetItemString(out, "mean_cz", PyFloat_FromDouble(vec->log.episode_cz_sum * inv_n)); + PyDict_SetItemString(out, "success_rate", PyFloat_FromDouble(vec->log.success_rate * inv_n)); + PyDict_SetItemString(out, "n", PyFloat_FromDouble(vec->log.n)); + PyDict_SetItemString(out, "episode_count", PyFloat_FromDouble(vec->log.n)); + PyDict_SetItemString(out, "episode_return_sum", PyFloat_FromDouble(vec->log.episode_return)); + PyDict_SetItemString(out, "episode_length_sum", PyFloat_FromDouble(vec->log.episode_length)); + PyDict_SetItemString(out, "episode_cz_sum", PyFloat_FromDouble(vec->log.episode_cz_sum)); + PyDict_SetItemString(out, "episode_cz_max", 
PyFloat_FromDouble(vec->log.episode_cz_max)); + PyDict_SetItemString(out, "success_count", PyFloat_FromDouble(vec->log.success_count)); + PyDict_SetItemString(out, "success_step_sum", PyFloat_FromDouble(vec->log.success_step_sum)); + PyDict_SetItemString(out, "success_step_sq_sum", PyFloat_FromDouble(vec->log.success_step_sq_sum)); + PyDict_SetItemString( + out, + "mean_success_cz", + PyFloat_FromDouble(vec->log.success_count > 0.0f ? (vec->log.success_cz_sum / vec->log.success_count) : 0.0) + ); + PyDict_SetItemString(out, "success_cz_sum", PyFloat_FromDouble(vec->log.success_cz_sum)); + PyDict_SetItemString(out, "success_cz_max", PyFloat_FromDouble(vec->log.success_cz_max)); + PyDict_SetItemString(out, "success_step_min", PyFloat_FromDouble(vec->log.success_step_min)); + PyDict_SetItemString(out, "success_step_max", PyFloat_FromDouble(vec->log.success_step_max)); + memset(&vec->log, 0, sizeof(CliffordLog)); + return out; +} + +static PyMethodDef MODULE_METHODS[] = { + {"vec_init", (PyCFunction)py_vec_init, METH_VARARGS | METH_KEYWORDS, "Create a native Clifford vec env"}, + {"vec_reset", py_vec_reset, METH_VARARGS, "Reset the native Clifford vec env"}, + {"vec_step", py_vec_step, METH_VARARGS, "Step the native Clifford vec env"}, + {"vec_close", py_vec_close, METH_VARARGS, "Close the native Clifford vec env"}, + {"vec_log", py_vec_log, METH_VARARGS, "Aggregate native Clifford logs"}, + {"vec_set_difficulty", py_vec_set_difficulty, METH_VARARGS, "Update native Clifford difficulty"}, + {"vec_set_max_steps", py_vec_set_max_steps, METH_VARARGS, "Update native Clifford max_steps"}, + {"vec_set_matrix", py_vec_set_matrix, METH_VARARGS, "Set the matrix for one native Clifford env"}, + {NULL, NULL, 0, NULL}, +}; + +static struct PyModuleDef MODULE_DEF = { + PyModuleDef_HEAD_INIT, + "binding", + "Native Clifford environment binding", + -1, + MODULE_METHODS, +}; + +PyMODINIT_FUNC PyInit_binding(void) { + import_array(); + return PyModule_Create(&MODULE_DEF); +} diff --git 
a/pufferlib/ocean/clifford/clifford.h b/pufferlib/ocean/clifford/clifford.h new file mode 100644 index 0000000000..6ea4324e0f --- /dev/null +++ b/pufferlib/ocean/clifford/clifford.h @@ -0,0 +1,579 @@ +#ifndef PUFFERLIB_OCEAN_CLIFFORD_CLIFFORD_H +#define PUFFERLIB_OCEAN_CLIFFORD_CLIFFORD_H + +#include +#include +#include +#include + +#define GATE_H 0 +#define GATE_S 1 +#define GATE_V 2 +#define GATE_HS 3 +#define GATE_HV 4 +#define GATE_CZ 5 + +#define REWARD_GATE_COST 0 +#define REWARD_HAMMING_LEFT 1 + +typedef struct { + int gate_kind; + int q0; + int q1; +} CliffordAction; + +typedef struct { + float perf; + float score; + float episode_return; + float episode_length; + float episode_cz_sum; + float episode_cz_max; + float success_rate; + float n; + float success_count; + float success_step_sum; + float success_step_sq_sum; + float success_cz_sum; + float success_cz_max; + float success_step_min; + float success_step_max; +} CliffordLog; + +typedef struct { + uint64_t state; + double next_gaussian; + int has_gaussian; +} XorShift64; + +typedef struct { + uint64_t* cols; + unsigned char* observations; + int* actions; + float* rewards; + unsigned char* terminals; + unsigned char* truncations; + float episode_return; + int episode_length; + int episode_cz_count; + int steps; + int episode_max_steps; + XorShift64 rng; +} CliffordEnv; + +typedef struct { + CliffordEnv* envs; + int num_envs; + int n_qubits; + int dim; + int obs_size; + int difficulty; + float difficulty_fraction; + int max_steps; + int num_actions; + CliffordAction* actions; + int reset_pool_enabled; + uint64_t* reset_pool_cols; + int reset_pool_size; + int reset_pool_walk_steps; + int reset_tail_steps; + int reset_refresh_stride; + int reset_refresh_credit; + XorShift64 pool_rng; + uint64_t identity_cols[64]; + float single_qubit_cost; + float goal_bonus; + int reward_mode; + float hamming_left_scale; + CliffordLog log; +} CliffordVecEnv; + +static uint64_t BYTE_TO_BYTES64[256]; +static uint64_t 
ROWMASK_ACCUM[8][256];
static int OBS_TABLES_READY = 0;

/* SplitMix64 step; seed-expansion helper (not referenced elsewhere in this
 * header — presumably used by the binding; verify against callers). */
static inline uint64_t splitmix64_next(uint64_t* state) {
    uint64_t z = (*state += 0x9E3779B97F4A7C15ULL);
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
    return z ^ (z >> 31);
}

/* Seed the PRNG; xorshift state must be nonzero, so 0 maps to a constant. */
static inline void rng_seed(XorShift64* rng, uint64_t seed) {
    if (seed == 0) {
        seed = 0x123456789ABCDEFULL;
    }
    rng->state = seed;
    rng->next_gaussian = 0.0;
    rng->has_gaussian = 0;
}

/* xorshift64* generator (shifts 12/25/27, Vigna's multiplier). */
static inline uint64_t rng_next_u64(XorShift64* rng) {
    uint64_t x = rng->state;
    x ^= x >> 12;
    x ^= x << 25;
    x ^= x >> 27;
    rng->state = x;
    return x * 2685821657736338717ULL;
}

/* Uniform draw in [0, upper) via modulo (tiny bias is acceptable here). */
static inline int rng_below(XorShift64* rng, int upper) {
    if (upper <= 1) {
        return 0;
    }
    return (int)(rng_next_u64(rng) % (uint64_t)upper);
}

/* Population count with a portable fallback (Kernighan's trick). */
static inline int popcount_u64(uint64_t value) {
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_popcountll((unsigned long long)value);
#else
    int count = 0;
    while (value != 0ULL) {
        value &= value - 1ULL;
        count += 1;
    }
    return count;
#endif
}

/* Uniform float in [0, 1) built from the top 24 bits of one draw. */
static inline float rng_float01(XorShift64* rng) {
    return (float)((rng_next_u64(rng) >> 40) * (1.0 / 16777216.0));
}

/* Split a real-valued difficulty into integer part + fractional part.
 * Fractions within 1e-6 of 0 or 1 are snapped to the nearest integer. */
static inline void set_difficulty_level(CliffordVecEnv* vec, double difficulty_level) {
    if (difficulty_level < 0.0) {
        difficulty_level = 0.0;
    }
    double floor_level = floor(difficulty_level + 1e-12);
    vec->difficulty = (int)floor_level;
    vec->difficulty_fraction = (float)(difficulty_level - floor_level);
    if (vec->difficulty_fraction <= 1e-6f) {
        vec->difficulty_fraction = 0.0f;
    } else if (vec->difficulty_fraction >= 1.0f - 1e-6f) {
        vec->difficulty += 1;
        vec->difficulty_fraction = 0.0f;
    }
}

/* Ceiling of the difficulty level: the harder of the two levels a fractional
 * difficulty can sample. Used to size the reset pool. */
static inline int effective_reset_difficulty(const CliffordVecEnv* vec) {
    return vec->difficulty + (vec->difficulty_fraction > 0.0f ? 1 : 0);
}

/* Sample an integer scramble length: difficulty, +1 with probability equal
 * to the fractional part. */
static inline int sample_reset_difficulty(const CliffordVecEnv* vec, XorShift64* rng) {
    if (vec->difficulty_fraction <= 0.0f) {
        return vec->difficulty;
    }
    return vec->difficulty + (rng_float01(rng) < vec->difficulty_fraction ? 1 : 0);
}

/* Build the bit-expansion and 8x8 transpose tables once (idempotent). */
static inline void init_observation_tables(void) {
    if (OBS_TABLES_READY) {
        return;
    }
    /* BYTE_TO_BYTES64[v]: bit i of v becomes byte i (0 or 1). */
    for (int value = 0; value < 256; ++value) {
        uint64_t expanded = 0ULL;
        for (int bit = 0; bit < 8; ++bit) {
            expanded |= (uint64_t)((value >> bit) & 1U) << (bit * 8);
        }
        BYTE_TO_BYTES64[value] = expanded;
    }
    /* ROWMASK_ACCUM[col][v]: scatter the 8 row-bits of a column chunk into
     * the col-th bit of eight packed row masks (one byte per row). */
    for (int col = 0; col < 8; ++col) {
        const uint64_t row_bit = (uint64_t)(1U << col);
        for (int value = 0; value < 256; ++value) {
            uint64_t packed = 0ULL;
            for (int row = 0; row < 8; ++row) {
                if ((value >> row) & 1U) {
                    packed |= row_bit << (row * 8);
                }
            }
            ROWMASK_ACCUM[col][value] = packed;
        }
    }
    OBS_TABLES_READY = 1;
}

/* Reset an env's tableau to the identity. */
static inline void copy_identity_cols(const CliffordVecEnv* vec, CliffordEnv* env) {
    memcpy(env->cols, vec->identity_cols, (size_t)vec->dim * sizeof(uint64_t));
}

/* Copy one packed tableau (dim columns). */
static inline void copy_cols(const CliffordVecEnv* vec, uint64_t* dst, const uint64_t* src) {
    memcpy(dst, src, (size_t)vec->dim * sizeof(uint64_t));
}

/* Zero the per-episode counters and latch the current max_steps. */
static inline void reset_episode_state(const CliffordVecEnv* vec, CliffordEnv* env) {
    env->steps = 0;
    env->episode_max_steps = vec->max_steps;
    env->episode_return = 0.0f;
    env->episode_length = 0;
    env->episode_cz_count = 0;
}

/* 1 iff the tableau equals the identity (the goal state). */
static inline int is_identity(const CliffordVecEnv* vec, const CliffordEnv* env) {
    for (int col = 0; col < vec->dim; ++col) {
        if (env->cols[col] != vec->identity_cols[col]) {
            return 0;
        }
    }
    return 1;
}

/* Uniform random action index, or -1 when no actions exist. */
static inline int sample_action(const CliffordVecEnv* vec, XorShift64* rng) {
    if (vec->num_actions <= 0) {
        return -1;
    }
    return rng_below(rng, vec->num_actions);
}

/* Apply one gate to the packed tableau. Column updates mirror the
 * apply_*_inplace helpers in reference.py (x = X-column, z = Z-column). */
static inline void apply_action(const CliffordVecEnv* vec, CliffordEnv* env, int action_idx) {
    const CliffordAction* action =
&vec->actions[action_idx];
    const int n = vec->n_qubits;
    const int q = action->q0;
    const uint64_t x = env->cols[q];        /* X-column of qubit q */
    const uint64_t z = env->cols[n + q];    /* Z-column of qubit q */
    if (action->gate_kind == GATE_H) {
        env->cols[q] = z;                   /* H swaps X and Z columns */
        env->cols[n + q] = x;
    } else if (action->gate_kind == GATE_S) {
        env->cols[q] = x;
        env->cols[n + q] = z ^ x;           /* S: Z-column ^= X-column */
    } else if (action->gate_kind == GATE_V) {
        env->cols[q] = x ^ z;               /* V = S.H.S */
        env->cols[n + q] = z;
    } else if (action->gate_kind == GATE_HS) {
        env->cols[q] = z;
        env->cols[n + q] = x ^ z;
    } else if (action->gate_kind == GATE_HV) {
        env->cols[q] = x ^ z;
        env->cols[n + q] = x;
    } else {
        /* CZ: each Z-column XORs the other qubit's X-column. Only Z-columns
         * are written, so there is no read-after-write hazard. */
        env->cols[n + action->q0] ^= env->cols[action->q1];
        env->cols[n + action->q1] ^= env->cols[action->q0];
    }
}

/* Number of fresh random gates appended after a pooled scramble, so pooled
 * resets do not all share identical endings. Clamped to [8, 32] and kept
 * strictly below the difficulty. Difficulty < 8 bypasses the pool entirely. */
static inline int compute_reset_tail_steps(int difficulty) {
    if (difficulty < 8) {
        return 0;
    }
    int tail = difficulty / 4;
    if (tail < 8) {
        tail = 8;
    }
    if (tail > 32) {
        tail = 32;
    }
    if (tail >= difficulty) {
        tail = difficulty / 2;
    }
    if (tail < 1) {
        tail = 1;
    }
    return tail;
}

/* Largest power of two <= value (the "<< 1 > 0" guard avoids overflow). */
static inline int floor_power_of_two(int value) {
    int power = 1;
    while ((power << 1) > 0 && (power << 1) <= value) {
        power <<= 1;
    }
    return power;
}

/* Size the scramble pool: ~8 entries per env, clamped to [2048, 32768] and
 * to a ~16 MiB memory budget, floored at 256 and rounded down to a power of
 * two (pool slot selection masks with size-1, which requires this). Returns
 * 0 when pooling is disabled, difficulty is fractional, or difficulty < 8. */
static inline int compute_reset_pool_size(const CliffordVecEnv* vec) {
    if (!vec->reset_pool_enabled) {
        return 0;
    }
    if (vec->difficulty_fraction > 0.0f) {
        return 0;
    }
    if (vec->difficulty < 8) {
        return 0;
    }
    const size_t bytes_per_entry = (size_t)vec->dim * sizeof(uint64_t);
    const size_t target_bytes = 16u << 20;
    int desired = vec->num_envs * 8;
    if (desired < 2048) {
        desired = 2048;
    }
    if (desired > 32768) {
        desired = 32768;
    }
    if (bytes_per_entry > 0) {
        const size_t cap_by_memory = target_bytes / bytes_per_entry;
        if (cap_by_memory > 0 && (size_t)desired > cap_by_memory) {
            desired = (int)cap_by_memory;
        }
    }
    if (desired < 256) {
        desired = 256;
    }
    return floor_power_of_two(desired);
}

/* Fill out_cols with the result of walk_steps random gates applied to the
 * identity; rejection-loops until the result is NOT the identity (a zero-step
 * walk returns the identity directly). */
static inline void generate_random_walk_cols(
    const CliffordVecEnv* vec,
    XorShift64* rng,
    int walk_steps,
    uint64_t* out_cols
) {
    CliffordEnv env = {
        .cols = out_cols,
    };
    if (walk_steps <= 0) {
        copy_identity_cols(vec, &env);
        return;
    }
    do {
        copy_identity_cols(vec, &env);
        for (int step = 0; step < walk_steps; ++step) {
            const int action_idx = sample_action(vec, rng);
            if (action_idx < 0) {
                break;
            }
            apply_action(vec, &env, action_idx);
        }
    } while (is_identity(vec, &env));
}

/* (Re)build the scramble pool for the current difficulty. Frees any previous
 * pool; on any disabled/degenerate configuration (or calloc failure) leaves
 * the pool empty so reset_single falls back to per-reset random walks. */
static inline void rebuild_reset_pool(CliffordVecEnv* vec, uint64_t seed) {
    free(vec->reset_pool_cols);
    vec->reset_pool_cols = NULL;
    const int pool_difficulty = effective_reset_difficulty(vec);
    if (!vec->reset_pool_enabled) {
        vec->reset_pool_size = 0;
        vec->reset_pool_walk_steps = 0;
        vec->reset_tail_steps = 0;
        vec->reset_refresh_stride = 0;
        vec->reset_refresh_credit = 0;
        rng_seed(&vec->pool_rng, seed ^ 0x9E3779B97F4A7C15ULL ^ (uint64_t)pool_difficulty);
        return;
    }
    vec->reset_pool_size = compute_reset_pool_size(vec);
    vec->reset_tail_steps = compute_reset_tail_steps(pool_difficulty);
    vec->reset_pool_walk_steps = pool_difficulty - vec->reset_tail_steps;
    vec->reset_refresh_stride = 32;
    vec->reset_refresh_credit = 0;
    rng_seed(&vec->pool_rng, seed ^ 0x9E3779B97F4A7C15ULL ^ (uint64_t)pool_difficulty);
    if (vec->reset_pool_size <= 0 || vec->reset_pool_walk_steps <= 0) {
        vec->reset_pool_size = 0;
        vec->reset_pool_walk_steps = 0;
        vec->reset_tail_steps = 0;
        return;
    }
    vec->reset_pool_cols = (uint64_t*)calloc(
        (size_t)vec->reset_pool_size * (size_t)vec->dim,
        sizeof(uint64_t)
    );
    if (vec->reset_pool_cols == NULL) {
        vec->reset_pool_size = 0;
        vec->reset_pool_walk_steps = 0;
        vec->reset_tail_steps = 0;
        return;
    }
    for (int slot = 0; slot < vec->reset_pool_size; ++slot) {
        generate_random_walk_cols(
            vec,
            &vec->pool_rng,
            vec->reset_pool_walk_steps,
            vec->reset_pool_cols + ((size_t)slot * (size_t)vec->dim)
        );
    }
}
/* Regenerate one random pool slot roughly every reset_refresh_stride resets,
 * so pool entries slowly churn instead of being fixed for a whole difficulty. */
static inline void maybe_refresh_reset_pool(CliffordVecEnv* vec) {
    if (vec->reset_pool_cols == NULL || vec->reset_pool_size <= 0 || vec->reset_pool_walk_steps <= 0) {
        return;
    }
    vec->reset_refresh_credit += 1;
    if (vec->reset_refresh_credit < vec->reset_refresh_stride) {
        return;
    }
    vec->reset_refresh_credit -= vec->reset_refresh_stride;
    /* pool size is a power of two, so masking with size-1 is a uniform pick */
    const int slot = (int)(rng_next_u64(&vec->pool_rng) & (uint64_t)(vec->reset_pool_size - 1));
    generate_random_walk_cols(
        vec,
        &vec->pool_rng,
        vec->reset_pool_walk_steps,
        vec->reset_pool_cols + ((size_t)slot * (size_t)vec->dim)
    );
}

/* Unpack the column-packed tableau into the dense row-major 0/1 observation.
 * The dim x dim matrix is processed in 8x8 bit blocks: ROWMASK_ACCUM
 * transposes a block's column chunks into packed row masks, BYTE_TO_BYTES64
 * expands each row mask to eight bytes written with one memcpy. The trailing
 * loops handle the ragged right/bottom edges when dim is not a multiple of 8. */
static inline void write_observation(const CliffordVecEnv* vec, CliffordEnv* env) {
    const int dim = vec->dim;
    int row_block = 0;
    for (; row_block + 8 <= dim; row_block += 8) {
        int col_block = 0;
        for (; col_block + 8 <= dim; col_block += 8) {
            uint64_t row_masks = 0ULL;
            for (int col = 0; col < 8; ++col) {
                const uint8_t column_chunk = (uint8_t)((env->cols[col_block + col] >> row_block) & 0xFFU);
                row_masks |= ROWMASK_ACCUM[col][column_chunk];
            }
            for (int row = 0; row < 8; ++row) {
                const uint8_t row_mask = (uint8_t)((row_masks >> (row * 8)) & 0xFFU);
                const uint64_t expanded = BYTE_TO_BYTES64[row_mask];
                memcpy(env->observations + ((row_block + row) * dim) + col_block, &expanded, 8);
            }
        }
        for (int row = 0; row < 8; ++row) {
            const int dst_row = row_block + row;
            for (int col = col_block; col < dim; ++col) {
                env->observations[dst_row * dim + col] = (unsigned char)((env->cols[col] >> dst_row) & 1ULL);
            }
        }
    }
    for (int row = row_block; row < dim; ++row) {
        for (int col = 0; col < dim; ++col) {
            env->observations[row * dim + col] = (unsigned char)((env->cols[col] >> row) & 1ULL);
        }
    }
}

/* Start a new episode: scramble the tableau with `reset_difficulty` random
 * gates (via the pool + fresh tail when available, otherwise a full random
 * walk), rejecting scrambles that land back on the identity, then publish
 * the observation. Difficulty 0 resets straight to the identity. */
static inline void reset_single(CliffordVecEnv* vec, CliffordEnv* env) {
    reset_episode_state(vec, env);
    const int reset_difficulty = sample_reset_difficulty(vec, &env->rng);
    if (reset_difficulty <= 0) {
        copy_identity_cols(vec, env);
        write_observation(vec, env);
        return;
    }

    if (vec->reset_pool_cols != NULL && vec->reset_pool_size > 0) {
        do {
            maybe_refresh_reset_pool(vec);
            /* power-of-two pool size makes this mask a uniform slot pick */
            const int slot = (int)(rng_next_u64(&env->rng) & (uint64_t)(vec->reset_pool_size - 1));
            copy_cols(
                vec,
                env->cols,
                vec->reset_pool_cols + ((size_t)slot * (size_t)vec->dim)
            );
            for (int step = 0; step < vec->reset_tail_steps; ++step) {
                const int action_idx = sample_action(vec, &env->rng);
                if (action_idx < 0) {
                    break;
                }
                apply_action(vec, env, action_idx);
            }
        } while (is_identity(vec, env));
        write_observation(vec, env);
        return;
    }

    do {
        copy_identity_cols(vec, env);
        for (int step = 0; step < reset_difficulty; ++step) {
            const int action_idx = sample_action(vec, &env->rng);
            if (action_idx < 0) {
                break;
            }
            apply_action(vec, env, action_idx);
        }
    } while (is_identity(vec, env));

    write_observation(vec, env);
}

/* Load a dense row-major 0/1 matrix into the env's packed columns and restart
 * the episode counters. Returns 0 on shape mismatch, 1 on success. Symplectic
 * validation is done by the Python wrapper, not here. */
static inline int set_matrix_from_dense(CliffordVecEnv* vec, CliffordEnv* env, const unsigned char* data, int rows, int cols) {
    if (rows != vec->dim || cols != vec->dim) {
        return 0;
    }
    for (int col = 0; col < vec->dim; ++col) {
        uint64_t packed = 0ULL;
        for (int row = 0; row < vec->dim; ++row) {
            packed |= ((uint64_t)(data[row * cols + col] & 1U)) << row;
        }
        env->cols[col] = packed;
    }
    reset_episode_state(vec, env);
    write_observation(vec, env);
    return 1;
}

/* Number of entries where the tableau differs from the identity. */
static inline int identity_hamming_distance(const CliffordVecEnv* vec, const CliffordEnv* env) {
    int distance = 0;
    for (int col = 0; col < vec->dim; ++col) {
        distance += popcount_u64(env->cols[col] ^ vec->identity_cols[col]);
    }
    return distance;
}

/* Hamming distance normalized by the matrix size (dim*dim), in [0, 1]. */
static inline float normalized_identity_hamming_distance(const CliffordVecEnv* vec, const CliffordEnv* env) {
    return (float)identity_hamming_distance(vec, env) / (float)vec->obs_size;
}

/* Fold one finished episode into the running log. Success-only stats also
 * track min/max step counts, initialized on the first successful episode. */
static inline void add_log(CliffordVecEnv* vec, CliffordEnv* env, int success) {
    vec->log.perf += success ? 1.0f : 0.0f;
    vec->log.score += env->episode_return;
    vec->log.episode_return += env->episode_return;
    vec->log.episode_length += (float)env->episode_length;
    vec->log.episode_cz_sum += (float)env->episode_cz_count;
    if ((float)env->episode_cz_count > vec->log.episode_cz_max) {
        vec->log.episode_cz_max = (float)env->episode_cz_count;
    }
    vec->log.success_rate += success ? 1.0f : 0.0f;
    vec->log.n += 1.0f;
    if (success) {
        vec->log.success_count += 1.0f;
        vec->log.success_step_sum += (float)env->episode_length;
        vec->log.success_step_sq_sum += (float)(env->episode_length * env->episode_length);
        vec->log.success_cz_sum += (float)env->episode_cz_count;
        if ((float)env->episode_cz_count > vec->log.success_cz_max) {
            vec->log.success_cz_max = (float)env->episode_cz_count;
        }
        if (vec->log.success_count <= 1.0f) {
            vec->log.success_step_min = (float)env->episode_length;
            vec->log.success_step_max = (float)env->episode_length;
        } else {
            if ((float)env->episode_length < vec->log.success_step_min) {
                vec->log.success_step_min = (float)env->episode_length;
            }
            if ((float)env->episode_length > vec->log.success_step_max) {
                vec->log.success_step_max = (float)env->episode_length;
            }
        }
    }
}

/* Base per-gate reward: -1 for CZ, -single_qubit_cost otherwise. */
static inline float gate_cost_reward(const CliffordVecEnv* vec, int gate_kind) {
    return gate_kind == GATE_CZ ?
-1.0f : -vec->single_qubit_cost;
}

/* Advance one env by the action in env->actions[0]. The raw action is
 * wrapped into [0, num_actions) (the extra "+=" handles negative C modulo).
 * Reward = gate cost, optionally minus scaled remaining Hamming distance
 * (REWARD_HAMMING_LEFT), plus goal_bonus on reaching the identity. Finished
 * episodes are logged and auto-reset; otherwise the observation is refreshed.
 * Returns the reward. */
static inline float step_single(CliffordVecEnv* vec, CliffordEnv* env) {
    int action_idx = env->actions[0] % vec->num_actions;
    if (action_idx < 0) {
        action_idx += vec->num_actions;
    }
    const CliffordAction* action = &vec->actions[action_idx];
    const int gate_kind = action->gate_kind;
    apply_action(vec, env, action_idx);
    env->steps += 1;
    env->episode_length += 1;
    if (gate_kind == GATE_CZ) {
        env->episode_cz_count += 1;
    }

    int terminated = is_identity(vec, env);
    int truncated = (!terminated && env->steps >= env->episode_max_steps);
    float reward = gate_cost_reward(vec, gate_kind);
    if (vec->reward_mode == REWARD_HAMMING_LEFT) {
        reward -= vec->hamming_left_scale * normalized_identity_hamming_distance(vec, env);
    }
    if (terminated) {
        reward += vec->goal_bonus;
    }
    env->episode_return += reward;
    env->rewards[0] = reward;
    env->terminals[0] = (unsigned char)terminated;
    env->truncations[0] = (unsigned char)truncated;

    if (terminated || truncated) {
        add_log(vec, env, terminated);
        reset_single(vec, env);
    } else {
        write_observation(vec, env);
    }
    return reward;
}

#endif
diff --git a/pufferlib/ocean/clifford/clifford.py b/pufferlib/ocean/clifford/clifford.py new file mode 100644 index 0000000000..62adf023b3 --- /dev/null +++ b/pufferlib/ocean/clifford/clifford.py @@ -0,0 +1,205 @@
import gymnasium
import numpy as np

import pufferlib
from .
import binding +from .reference import _build_actions, _is_symplectic + +_GATE_KIND = { + "h": 0, + "s": 1, + "v": 2, + "hs": 3, + "hv": 4, + "cz": 5, +} +_REWARD_MODE = { + "gate_cost": 0, + "hamming_left": 1, +} + + +class Clifford(pufferlib.PufferEnv): + def __init__( + self, + num_envs=1, + n_qubits=6, + difficulty=10, + max_steps=200, + single_qubit_cost=0.01, + goal_bonus=0.0, + reward_mode="gate_cost", + hamming_left_scale=0.5, + use_reset_pool=True, + log_interval=128, + buf=None, + seed=0, + render_mode=None, + ): + if n_qubits <= 0: + raise ValueError("n_qubits must be positive") + if 2 * n_qubits > 64: + raise ValueError("native Clifford env requires 2*n_qubits <= 64") + if num_envs <= 0: + raise ValueError("num_envs must be positive") + if difficulty < 0: + raise ValueError("difficulty must be non-negative") + if max_steps <= 0: + raise ValueError("max_steps must be positive") + if goal_bonus < 0.0: + raise ValueError("goal_bonus must be non-negative") + if reward_mode not in _REWARD_MODE: + raise ValueError(f"reward_mode must be one of {sorted(_REWARD_MODE)}") + if hamming_left_scale < 0.0: + raise ValueError("hamming_left_scale must be non-negative") + + self.n_qubits = int(n_qubits) + self.dim = 2 * self.n_qubits + self._actions = _build_actions(self.n_qubits) + self.log_interval = int(log_interval) + self.render_mode = render_mode + self.num_agents = int(num_envs) + self.agents_per_batch = self.num_agents + + obs_size = self.dim * self.dim + self.single_observation_space = gymnasium.spaces.Box( + low=0, + high=1, + shape=(obs_size,), + dtype=np.uint8, + ) + self.single_action_space = gymnasium.spaces.Discrete(len(self._actions)) + super().__init__(buf) + + gate_kinds = np.empty(len(self._actions), dtype=np.int32) + action_q0 = np.empty(len(self._actions), dtype=np.int32) + action_q1 = np.empty(len(self._actions), dtype=np.int32) + for idx, (gate, q0, q1) in enumerate(self._actions): + gate_kinds[idx] = _GATE_KIND[gate] + action_q0[idx] = q0 + 
action_q1[idx] = q1 + + self.tick = 0 + self.c_envs = binding.vec_init( + self.observations, + self.actions, + self.rewards, + self.terminals, + self.truncations, + self.num_agents, + int(seed), + n_qubits=self.n_qubits, + difficulty=float(difficulty), + max_steps=int(max_steps), + gate_kinds=gate_kinds, + action_q0=action_q0, + action_q1=action_q1, + single_qubit_cost=float(single_qubit_cost), + goal_bonus=float(goal_bonus), + reward_mode=int(_REWARD_MODE[reward_mode]), + hamming_left_scale=float(hamming_left_scale), + use_reset_pool=bool(use_reset_pool), + ) + + def reset(self, seed=0): + self.tick = 0 + binding.vec_reset(self.c_envs, -1 if seed is None else int(seed)) + return self.observations, [] + + def step(self, actions): + actions = np.asarray(actions, dtype=np.int32) + if actions.shape != (self.num_agents,): + raise ValueError(f"Expected actions with shape ({self.num_agents},), got {actions.shape}") + + self.actions[:] = actions + binding.vec_step(self.c_envs) + self.tick += 1 + + info = [] + if self.tick % self.log_interval == 0: + metrics = binding.vec_log(self.c_envs) + if metrics: + info.append(metrics) + + return ( + self.observations, + self.rewards, + self.terminals, + self.truncations, + info, + ) + + def close(self): + if getattr(self, "c_envs", None) is not None: + binding.vec_close(self.c_envs) + self.c_envs = None + + def flush_logs(self): + metrics = binding.vec_log(self.c_envs) + return metrics or None + + def set_difficulty(self, difficulty): + difficulty = float(difficulty) + if difficulty < 0.0: + raise ValueError("difficulty must be non-negative") + binding.vec_set_difficulty(self.c_envs, difficulty) + + def set_max_steps(self, max_steps): + max_steps = int(max_steps) + if max_steps <= 0: + raise ValueError("max_steps must be positive") + binding.vec_set_max_steps(self.c_envs, max_steps) + + def set_matrix(self, matrix, env_index=0): + matrix_u8 = np.ascontiguousarray(np.asarray(matrix, dtype=np.uint8)) + expected_shape = (self.dim, 
self.dim)
        if matrix_u8.shape != expected_shape:
            raise ValueError(f"Expected matrix shape {expected_shape}, got {matrix_u8.shape}")
        if not _is_symplectic(matrix_u8):
            raise ValueError("matrix must be symplectic")
        binding.vec_set_matrix(self.c_envs, int(env_index), matrix_u8)


def test_performance(
    timeout=10,
    atn_cache=1024,
    num_envs=2048,
    n_qubits=6,
    difficulty=10,
    max_steps=200,
    use_reset_pool=True,
):
    """Benchmark: step random cached actions for `timeout` seconds, print SPS."""
    env = Clifford(
        num_envs=num_envs,
        n_qubits=n_qubits,
        difficulty=difficulty,
        max_steps=max_steps,
        use_reset_pool=use_reset_pool,
    )
    env.reset()
    tick = 0

    actions = np.random.randint(
        0,
        env.single_action_space.n,
        (atn_cache, num_envs),
        dtype=np.int32,
    )

    import time
    start = time.time()
    while time.time() - start < timeout:
        env.step(actions[tick % atn_cache])
        tick += 1

    sps = num_envs * tick / (time.time() - start)
    print(
        f"Clifford SPS: {sps:,.0f} "
        f"(num_envs={num_envs}, n_qubits={n_qubits}, difficulty={difficulty}, "
        f"max_steps={max_steps}, use_reset_pool={use_reset_pool})"
    )
    env.close()


if __name__ == "__main__":
    test_performance()
diff --git a/pufferlib/ocean/clifford/reference.py b/pufferlib/ocean/clifford/reference.py new file mode 100644 index 0000000000..280b2fa345 --- /dev/null +++ b/pufferlib/ocean/clifford/reference.py @@ -0,0 +1,240 @@
from __future__ import annotations

from typing import Any

import gymnasium
import numpy as np

# (gate_name, q0, q1); q1 is -1 for single-qubit gates.
Action = tuple[str, int, int]
SINGLE_QUBIT_GATES = ("h", "s", "v", "hs", "hv")
REWARD_MODES = {"gate_cost", "hamming_left"}


def _build_actions(n_qubits: int) -> list[Action]:
    """Enumerate the action list: all single-qubit gates per qubit, then all
    CZ pairs with src < dst. Ordering must match the native binding."""
    actions: list[Action] = []
    for gate in SINGLE_QUBIT_GATES:
        for qubit in range(n_qubits):
            actions.append((gate, qubit, -1))

    for src in range(n_qubits):
        for dst in range(src + 1, n_qubits):
            actions.append(("cz", src, dst))

    return actions


def _identity_symplectic(n_qubits: int) -> np.ndarray:
    """The 2n x 2n identity tableau (the goal state)."""
    return np.eye(2 * n_qubits, dtype=np.uint8)


def _symplectic_form(n_qubits: int) -> np.ndarray:
    """Standard symplectic form Omega = [[0, I], [I, 0]] over GF(2)."""
    omega = np.zeros((2 * n_qubits, 2 * n_qubits), dtype=np.uint8)
    eye = np.eye(n_qubits, dtype=np.uint8)
    omega[:n_qubits, n_qubits:] = eye
    omega[n_qubits:, :n_qubits] = eye
    return omega


def _is_symplectic(matrix: np.ndarray) -> bool:
    """True iff M^T Omega M == Omega (mod 2) for square even-dimension M."""
    matrix_u8 = np.asarray(matrix, dtype=np.uint8)
    if matrix_u8.ndim != 2 or matrix_u8.shape[0] != matrix_u8.shape[1]:
        return False
    if matrix_u8.shape[0] % 2 != 0:
        return False
    n_qubits = matrix_u8.shape[0] // 2
    omega = _symplectic_form(n_qubits)
    lhs = (matrix_u8.T @ omega @ matrix_u8) % 2
    return bool(np.array_equal(lhs.astype(np.uint8), omega))


def _identity_hamming_distance(matrix: np.ndarray) -> int:
    """Number of entries differing from the identity matrix."""
    identity = np.eye(matrix.shape[0], dtype=np.uint8)
    return int(np.count_nonzero(matrix != identity))


def _normalized_identity_hamming_distance(matrix: np.ndarray) -> float:
    """Hamming distance to the identity divided by matrix size; in [0, 1]."""
    return float(_identity_hamming_distance(matrix) / matrix.size)


def _xor_columns_inplace(matrix: np.ndarray, dst_idx: int, src_col: np.ndarray) -> None:
    """matrix[:, dst_idx] ^= src_col, in place."""
    np.bitwise_xor(matrix[:, dst_idx], src_col, out=matrix[:, dst_idx])


def apply_h_inplace(matrix: np.ndarray, qubit: int) -> None:
    """Hadamard: swap the qubit's X and Z columns."""
    n_qubits = matrix.shape[0] // 2
    z_col = n_qubits + qubit
    matrix[:, [qubit, z_col]] = matrix[:, [z_col, qubit]]


def apply_s_inplace(matrix: np.ndarray, qubit: int) -> None:
    """Phase gate S: Z-column ^= X-column."""
    n_qubits = matrix.shape[0] // 2
    _xor_columns_inplace(matrix, n_qubits + qubit, matrix[:, qubit].copy())


def apply_v_inplace(matrix: np.ndarray, qubit: int) -> None:
    """V gate, implemented as the composition S.H.S."""
    apply_s_inplace(matrix, qubit)
    apply_h_inplace(matrix, qubit)
    apply_s_inplace(matrix, qubit)


def apply_hs_inplace(matrix: np.ndarray, qubit: int) -> None:
    """Composite gate: H then S."""
    apply_h_inplace(matrix, qubit)
    apply_s_inplace(matrix, qubit)


def apply_hv_inplace(matrix: np.ndarray, qubit: int) -> None:
    """Composite gate: H then V."""
    apply_h_inplace(matrix, qubit)
    apply_v_inplace(matrix, qubit)


def apply_cz_inplace(matrix: np.ndarray, src: int,
dst: int) -> None:
    """CZ: each qubit's Z-column ^= the other qubit's X-column.

    X-columns are copied first so the two updates use the pre-gate values.
    """
    if src == dst:
        raise ValueError("CZ requires distinct qubits")
    n_qubits = matrix.shape[0] // 2
    src_x = matrix[:, src].copy()
    dst_x = matrix[:, dst].copy()
    _xor_columns_inplace(matrix, n_qubits + src, dst_x)
    _xor_columns_inplace(matrix, n_qubits + dst, src_x)


class ReferenceCliffordEnv(gymnasium.Env):
    """Pure-Python reference implementation of the Clifford env.

    Single-agent gymnasium.Env used as ground truth for the native binding:
    same tableau representation, action ordering, and reward semantics.
    """

    metadata = {"render_modes": []}

    def __init__(
        self,
        n_qubits: int = 6,
        difficulty: int = 10,
        max_steps: int = 200,
        single_qubit_cost: float = 0.01,
        goal_bonus: float = 0.0,
        reward_mode: str = "gate_cost",
        hamming_left_scale: float = 0.5,
        render_mode=None,
    ):
        super().__init__()
        # Same validation contract as the native wrapper.
        if n_qubits <= 0:
            raise ValueError("n_qubits must be positive")
        if difficulty < 0:
            raise ValueError("difficulty must be non-negative")
        if max_steps <= 0:
            raise ValueError("max_steps must be positive")
        if goal_bonus < 0.0:
            raise ValueError("goal_bonus must be non-negative")
        if reward_mode not in REWARD_MODES:
            raise ValueError(f"reward_mode must be one of {sorted(REWARD_MODES)}")
        if hamming_left_scale < 0.0:
            raise ValueError("hamming_left_scale must be non-negative")

        self.n_qubits = int(n_qubits)
        self.difficulty = int(difficulty)
        self.max_steps = int(max_steps)
        self.single_qubit_cost = float(single_qubit_cost)
        self.goal_bonus = float(goal_bonus)
        self.reward_mode = str(reward_mode)
        self.hamming_left_scale = float(hamming_left_scale)
        self.render_mode = render_mode
        self._actions = _build_actions(self.n_qubits)
        self._identity = _identity_symplectic(self.n_qubits)
        self._matrix = self._identity.copy()
        self.steps = 0

        obs_size = self._identity.size
        self.action_space = gymnasium.spaces.Discrete(len(self._actions))
        self.observation_space = gymnasium.spaces.Box(
            low=0,
            high=1,
            shape=(obs_size,),
            dtype=np.uint8,
        )

    def _get_obs(self) -> np.ndarray:
        """Flattened copy of the current tableau."""
        return self._matrix.reshape(-1).copy()

    def set_difficulty(self, difficulty: int) -> None:
        """Set the scramble length used by subsequent resets."""
        difficulty = int(difficulty)
        if difficulty < 0:
            raise ValueError("difficulty must be non-negative")
        self.difficulty = difficulty

    def set_max_steps(self, max_steps: int) -> None:
        """Set the episode step limit."""
        max_steps = int(max_steps)
        if max_steps <= 0:
            raise ValueError("max_steps must be positive")
        self.max_steps = max_steps

    def set_matrix(self, matrix: np.ndarray) -> None:
        """Force the tableau to a given symplectic matrix; resets the step count."""
        matrix_u8 = np.ascontiguousarray(np.asarray(matrix, dtype=np.uint8))
        expected_shape = (2 * self.n_qubits, 2 * self.n_qubits)
        if matrix_u8.shape != expected_shape:
            raise ValueError(f"Expected matrix shape {expected_shape}, got {matrix_u8.shape}")
        if not _is_symplectic(matrix_u8):
            raise ValueError("matrix must be symplectic")
        self._matrix = matrix_u8.copy()
        self.steps = 0

    def _sample_action(self) -> int:
        """Uniform random action index using the gymnasium-seeded RNG."""
        return int(self.np_random.integers(len(self._actions)))

    def _apply_action(self, action_idx: int) -> str:
        """Apply one gate to the tableau; returns the gate name."""
        gate, q0, q1 = self._actions[action_idx]
        if gate == "h":
            apply_h_inplace(self._matrix, q0)
        elif gate == "s":
            apply_s_inplace(self._matrix, q0)
        elif gate == "v":
            apply_v_inplace(self._matrix, q0)
        elif gate == "hs":
            apply_hs_inplace(self._matrix, q0)
        elif gate == "hv":
            apply_hv_inplace(self._matrix, q0)
        elif gate == "cz":
            apply_cz_inplace(self._matrix, q0, q1)
        else:
            raise ValueError(f"Unknown gate {gate}")
        return gate

    def reset(
        self,
        *,
        seed: int | None = None,
        options: dict[str, Any] | None = None,
    ) -> tuple[np.ndarray, dict]:
        """Reset to a scrambled (non-identity) tableau, or to options["matrix"].

        NOTE(review): the options["matrix"] path returns before steps is
        reset — set_matrix already zeroes it, so behavior is unchanged.
        """
        super().reset(seed=seed)
        if options and "matrix" in options:
            self.set_matrix(options["matrix"])
            return self._get_obs(), {}

        if self.difficulty <= 0:
            self._matrix = self._identity.copy()
            self.steps = 0
            return self._get_obs(), {}

        # Rejection-sample: re-scramble until we are not on the identity.
        while True:
            self._matrix = self._identity.copy()
            for _ in range(self.difficulty):
                self._apply_action(self._sample_action())
            if not np.array_equal(self._matrix, self._identity):
                break

        self.steps = 0
        return self._get_obs(), {}

    def step(self, action: int) -> tuple[np.ndarray,
float, bool, bool, dict]:
        """Apply one action (wrapped into range) and return the gym 5-tuple.

        Reward: -1 for CZ, -single_qubit_cost otherwise; "hamming_left" mode
        subtracts scaled remaining distance; goal_bonus added on success.
        """
        action_idx = int(action) % len(self._actions)
        gate = self._apply_action(action_idx)
        self.steps += 1

        terminated = bool(np.array_equal(self._matrix, self._identity))
        truncated = bool(not terminated and self.steps >= self.max_steps)
        reward = -1.0 if gate == "cz" else -self.single_qubit_cost
        if self.reward_mode == "hamming_left":
            reward -= self.hamming_left_scale * _normalized_identity_hamming_distance(self._matrix)
        if terminated:
            reward += self.goal_bonus

        return self._get_obs(), float(reward), terminated, truncated, {}


__all__ = [
    "ReferenceCliffordEnv",
]
diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 6c56a4ea20..f3d355ae46 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -120,6 +120,7 @@ def make_multiagent(buf=None, **kwargs):
     'battle': 'Battle',
     'breakout': 'Breakout',
     'blastar': 'Blastar',
+    'clifford': 'Clifford',
     'convert': 'Convert',
     'convert_circle': 'ConvertCircle',
     'pong': 'Pong',
diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 7132a19f90..bf8c67da47 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -28,7 +28,6 @@ import torch.utils.cpp_extension
 import pufferlib
-import pufferlib.sweep
 import pufferlib.vector
 import pufferlib.pytorch
 try:
@@ -1055,10 +1054,16 @@ def sweep(args=None, env_name=None):
     if not args['wandb'] and not args['neptune']:
         raise pufferlib.APIUsageError('Sweeps require either wandb or neptune')
     args['no_model_upload'] = True # Uploading trained model during sweep crashed wandb
+    try:
+        import pufferlib.sweep as puffer_sweep
+    except ImportError as exc:
+        raise ImportError(
+            'Sweep mode requires optional sweep dependencies such as gpytorch.'
+        ) from exc
     method = args['sweep'].pop('method')
     try:
-        sweep_cls = getattr(pufferlib.sweep, method)
+        sweep_cls = getattr(puffer_sweep, method)
     except:
         raise pufferlib.APIUsageError(f'Invalid sweep method {method}.
See pufferlib.sweep')
diff --git a/tests/test_ocean_clifford.py b/tests/test_ocean_clifford.py new file mode 100644 index 0000000000..8fd8716d15 --- /dev/null +++ b/tests/test_ocean_clifford.py @@ -0,0 +1,334 @@
import numpy as np
import pytest

from pufferlib.ocean.clifford.reference import (
    ReferenceCliffordEnv,
)

# Native binding is optional; skip native tests when it is not built.
try:
    import pufferlib.ocean.clifford.binding  # noqa: F401
except ImportError:
    BINDING_AVAILABLE = False
else:
    BINDING_AVAILABLE = True
    from pufferlib.ocean.clifford.clifford import Clifford


def _apply_explicit(matrix, gate):
    """Right-multiply by an explicit gate matrix over GF(2)."""
    return ((matrix.astype(np.uint8) @ gate.astype(np.uint8)) % 2).astype(np.uint8)


def _identity_symplectic(n_qubits):
    return np.eye(2 * n_qubits, dtype=np.uint8)


def _build_actions(n_qubits):
    """Local copy of the env's action enumeration (kept in sync by a test below)."""
    actions = []
    for gate in ("h", "s", "v", "hs", "hv"):
        for qubit in range(n_qubits):
            actions.append((gate, qubit, -1))
    for src in range(n_qubits):
        for dst in range(src + 1, n_qubits):
            actions.append(("cz", src, dst))
    return actions


def _symplectic_matrix_h(n_qubits, qubit):
    """Explicit H gate matrix: swaps the qubit's X/Z columns of the identity."""
    gate = _identity_symplectic(n_qubits)
    z_col = n_qubits + qubit
    gate[:, [qubit, z_col]] = gate[:, [z_col, qubit]]
    return gate


def _symplectic_matrix_s(n_qubits, qubit):
    """Explicit S gate matrix."""
    gate = _identity_symplectic(n_qubits)
    gate[qubit, n_qubits + qubit] = 1
    return gate


def _symplectic_matrix_v(n_qubits, qubit):
    """Explicit V gate matrix via the S.H.S decomposition."""
    gate = _symplectic_matrix_s(n_qubits, qubit)
    gate = (gate @ _symplectic_matrix_h(n_qubits, qubit)) % 2
    gate = (gate @ _symplectic_matrix_s(n_qubits, qubit)) % 2
    return gate.astype(np.uint8, copy=False)


def _symplectic_matrix_hs(n_qubits, qubit):
    """Explicit HS composite gate matrix."""
    gate = _symplectic_matrix_h(n_qubits, qubit)
    gate = (gate @ _symplectic_matrix_s(n_qubits, qubit)) % 2
    return gate.astype(np.uint8, copy=False)


def _symplectic_matrix_hv(n_qubits, qubit):
    """Explicit HV composite gate matrix."""
    gate = _symplectic_matrix_h(n_qubits, qubit)
    gate = (gate @ _symplectic_matrix_v(n_qubits, qubit)) % 2
    return gate.astype(np.uint8, copy=False)

def _symplectic_matrix_cz(n_qubits, src, dst):
    """Return the symplectic update matrix for a CZ gate on qubits src/dst."""
    mat = _identity_symplectic(n_qubits)
    # CZ couples X-part rows to the partner qubit's Z-part columns, symmetrically.
    for row, col in ((src, dst), (dst, src)):
        mat[row, n_qubits + col] = 1
    return mat


def _find_action(actions, gate_name, q0, q1=-1):
    """Return the index of (gate_name, q0, q1) in *actions*; fail loudly if absent."""
    target = (gate_name, q0, q1)
    idx = next((i for i, act in enumerate(actions) if act == target), None)
    if idx is None:
        raise AssertionError(f"Action {target} not found")
    return idx


def _normalized_identity_hamming_distance(matrix):
    """Fraction of entries in *matrix* that differ from the identity tableau."""
    identity = _identity_symplectic(matrix.shape[0] // 2)
    return float(np.count_nonzero(matrix != identity) / matrix.size)


def test_build_actions_full_connectivity_order():
    # 3 qubits, all-to-all connectivity: single-qubit gates come first
    # (grouped by gate type, then qubit), CZ pairs come last.
    actions = _build_actions(3)
    assert len(actions) == 18
    expected_head = [("h", q, -1) for q in range(3)] + [("s", 0, -1), ("s", 1, -1)]
    assert actions[:5] == expected_head
    assert actions[-3:] == [("cz", 0, 1), ("cz", 0, 2), ("cz", 1, 2)]


def test_reference_reset_with_difficulty_zero_is_identity():
    env = ReferenceCliffordEnv(n_qubits=3, difficulty=0, max_steps=8)
    obs, info = env.reset(seed=7)
    np.testing.assert_array_equal(obs, _identity_symplectic(3).reshape(-1))
    assert info == {}


def test_reference_reset_with_difficulty_nonzero_is_not_identity():
    env = ReferenceCliffordEnv(n_qubits=3, difficulty=4, max_steps=8)
    obs, _info = env.reset(seed=7)
    flat_identity = _identity_symplectic(3).reshape(-1)
    assert not np.array_equal(obs, flat_identity)


def test_reference_set_matrix_rejects_nonsymplectic_input():
    env = ReferenceCliffordEnv(n_qubits=2, difficulty=0, max_steps=8)
    # The all-zeros matrix is not symplectic and must be refused.
    with pytest.raises(ValueError, match="symplectic"):
        env.set_matrix(np.zeros((4, 4), dtype=np.uint8))


@pytest.mark.parametrize("reward_mode", ["bad_mode", "dense_hamming", "end_hamming_left"])
def test_reference_rejects_invalid_reward_mode(reward_mode):
    with pytest.raises(ValueError, match="reward_mode"):
        ReferenceCliffordEnv(n_qubits=2, reward_mode=reward_mode)


def test_reference_rejects_negative_hamming_left_scale():
    with pytest.raises(ValueError, match="hamming_left_scale"):
        ReferenceCliffordEnv(n_qubits=2, hamming_left_scale=-0.1)


@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
def test_native_env_shapes_and_dtypes():
    env = Clifford(n_qubits=3, difficulty=0, max_steps=8, num_envs=4)
    try:
        obs, info = env.reset(seed=11)
        # 4 envs, each observing a flattened 6x6 (= 36 entry) tableau.
        assert obs.shape == (4, 36)
        assert obs.dtype == np.uint8
        assert info == []
        assert env.actions.dtype == np.int32
        assert env.rewards.dtype == np.float32
        assert env.terminals.dtype == np.bool_
        assert env.truncations.dtype == np.bool_
    finally:
        env.close()


# One representative action per gate family, shared by both parity tests below.
_GATE_ACTIONS = [
    ("h", 0, -1),
    ("s", 1, -1),
    ("v", 2, -1),
    ("hs", 1, -1),
    ("hv", 0, -1),
    ("cz", 0, 2),
]


def _scrambled_matrix():
    """Fixed non-identity symplectic matrix used as a shared starting state."""
    mat = _identity_symplectic(3)
    mat = _apply_explicit(mat, _symplectic_matrix_h(3, 1))
    mat = _apply_explicit(mat, _symplectic_matrix_s(3, 2))
    return _apply_explicit(mat, _symplectic_matrix_cz(3, 0, 2))


@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
@pytest.mark.parametrize("action", _GATE_ACTIONS)
def test_native_matches_reference_for_gate_updates(action):
    env_ref = ReferenceCliffordEnv(n_qubits=3, difficulty=0, max_steps=10)
    env_native = Clifford(n_qubits=3, difficulty=0, max_steps=10, num_envs=1)
    try:
        base = _scrambled_matrix()
        action_idx = _find_action(env_ref._actions, *action)

        # Put both envs in the same starting state before stepping.
        env_ref.reset(options={"matrix": base})
        env_native.reset(seed=0)
        env_native.set_matrix(base)

        obs_ref, reward_ref, term_ref, trunc_ref, info_ref = env_ref.step(action_idx)
        native_step = env_native.step(np.asarray([action_idx], dtype=np.int32))
        obs_native, rewards, terminals, truncations, info_native = native_step

        np.testing.assert_array_equal(obs_native[0], obs_ref)
        assert rewards[0] == pytest.approx(reward_ref, abs=1e-6)
        assert bool(terminals[0]) == term_ref
        assert bool(truncations[0]) == trunc_ref
        assert info_ref == {}
        assert info_native == []
    finally:
        env_native.close()


@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
@pytest.mark.parametrize("action", _GATE_ACTIONS)
def test_native_matches_reference_for_hamming_left_rewards(action):
    env_ref = ReferenceCliffordEnv(
        n_qubits=3,
        difficulty=0,
        max_steps=10,
        reward_mode="hamming_left",
        hamming_left_scale=0.5,
    )
    env_native = Clifford(
        n_qubits=3,
        difficulty=0,
        max_steps=10,
        num_envs=1,
        reward_mode="hamming_left",
        hamming_left_scale=0.5,
    )
    try:
        base = _scrambled_matrix()
        action_idx = _find_action(env_ref._actions, *action)

        # Put both envs in the same starting state before stepping.
        env_ref.reset(options={"matrix": base})
        env_native.reset(seed=0)
        env_native.set_matrix(base)

        obs_ref, reward_ref, term_ref, trunc_ref, _info_ref = env_ref.step(action_idx)
        native_step = env_native.step(np.asarray([action_idx], dtype=np.int32))
        obs_native, rewards, terminals, truncations, info_native = native_step

        np.testing.assert_array_equal(obs_native[0], obs_ref)
        assert rewards[0] == pytest.approx(reward_ref, abs=1e-6)
        assert bool(terminals[0]) == term_ref
        assert bool(truncations[0]) == trunc_ref
        assert info_native == []
    finally:
        env_native.close()


def test_reference_hamming_left_penalizes_remaining_distance():
    env = ReferenceCliffordEnv(
        n_qubits=2,
        difficulty=0,
        max_steps=8,
        single_qubit_cost=0.01,
        reward_mode="hamming_left",
        hamming_left_scale=0.5,
    )
    env.reset(options={"matrix": _symplectic_matrix_h(2, 1)})
    action_idx = _find_action(_build_actions(2), "s", 0)
    obs, reward, terminated, truncated, _info = env.step(action_idx)
    # Reward = per-step cost plus a penalty proportional to how far the
    # resulting tableau still is from the identity.
    remaining = _normalized_identity_hamming_distance(obs.reshape(4, 4))
    assert reward == pytest.approx(-0.01 - 0.5 * remaining, abs=1e-6)
    assert not terminated
    assert not truncated
@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
def test_native_terminal_step_auto_resets_to_identity():
    """Solving the puzzle terminates the episode and auto-resets the env.

    Starting one H away from identity, applying that H yields
    goal_bonus (5.0) minus single_qubit_cost (0.01) = 4.99, and the
    returned observation is the freshly reset identity tableau.
    """
    env = Clifford(
        n_qubits=2,
        difficulty=0,
        max_steps=8,
        single_qubit_cost=0.01,
        goal_bonus=5.0,
        num_envs=1,
    )
    try:
        env.reset(seed=0)
        env.set_matrix(_symplectic_matrix_h(2, 0))
        action_idx = _find_action(_build_actions(2), "h", 0)
        obs, rewards, terminals, truncations, info = env.step(
            np.asarray([action_idx], dtype=np.int32)
        )
        assert rewards[0] == pytest.approx(4.99, abs=1e-6)
        assert bool(terminals[0])
        assert not bool(truncations[0])
        np.testing.assert_array_equal(obs[0], _identity_symplectic(2).reshape(-1))
        assert info == []
    finally:
        env.close()


@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
def test_native_hamming_left_terminal_reward_matches_reference():
    """Terminal reward in "hamming_left" mode agrees with the reference env.

    Fix: the previous version never consulted the reference implementation
    despite its name -- it only re-checked the hardcoded 4.99 already covered
    by the plain terminal test. It now steps a ReferenceCliffordEnv with
    identical settings and asserts reward/termination agreement (the 4.99
    check is kept as well).
    """
    # NOTE(review): assumes ReferenceCliffordEnv accepts goal_bonus like the
    # native binding does -- confirm against its constructor.
    common = dict(
        n_qubits=2,
        difficulty=0,
        max_steps=8,
        single_qubit_cost=0.01,
        goal_bonus=5.0,
        reward_mode="hamming_left",
        hamming_left_scale=0.5,
    )
    env_ref = ReferenceCliffordEnv(**common)
    env = Clifford(num_envs=1, **common)
    try:
        start = _symplectic_matrix_h(2, 0)
        action_idx = _find_action(_build_actions(2), "h", 0)

        env_ref.reset(options={"matrix": start})
        _obs_ref, reward_ref, term_ref, trunc_ref, _info_ref = env_ref.step(action_idx)

        env.reset(seed=0)
        env.set_matrix(start)
        obs, rewards, terminals, truncations, info = env.step(
            np.asarray([action_idx], dtype=np.int32)
        )

        # At the goal the remaining Hamming distance is zero, so the terminal
        # reward reduces to goal_bonus - single_qubit_cost = 4.99 in both envs.
        assert reward_ref == pytest.approx(4.99, abs=1e-6)
        assert rewards[0] == pytest.approx(reward_ref, abs=1e-6)
        assert bool(terminals[0]) == term_ref
        assert bool(truncations[0]) == trunc_ref
        # Native envs auto-reset on terminal steps, so the observation is the
        # freshly reset identity tableau (not compared to the reference,
        # whose post-terminal observation semantics may differ).
        np.testing.assert_array_equal(obs[0], _identity_symplectic(2).reshape(-1))
        assert info == []
    finally:
        env.close()


@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
def test_native_set_matrix_rejects_nonsymplectic_input():
    """set_matrix validates symplecticity, mirroring the reference env."""
    env = Clifford(n_qubits=2, difficulty=0, max_steps=8, num_envs=1)
    try:
        bad = np.zeros((4, 4), dtype=np.uint8)
        with pytest.raises(ValueError, match="symplectic"):
            env.set_matrix(bad)
    finally:
        env.close()


@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
def test_native_rejects_invalid_reward_mode():
    """The constructor rejects unknown reward_mode values, like the reference."""
    with pytest.raises(ValueError, match="reward_mode"):
        Clifford(n_qubits=2, reward_mode="dense_hamming")


@pytest.mark.skipif(not BINDING_AVAILABLE, reason="native clifford binding is not built")
def test_native_rejects_negative_hamming_left_scale():
    """The constructor rejects a negative hamming_left_scale, like the reference."""
    with pytest.raises(ValueError, match="hamming_left_scale"):
        Clifford(n_qubits=2, hamming_left_scale=-0.1)