Skip to content

Commit 1183bdd

Browse files
Replace Chapter 3 folder
1 parent c7c3577 commit 1183bdd

21 files changed

Lines changed: 312 additions & 168 deletions

ch3_multi_armed_bandits/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Chapter 3 — Multi-Armed Bandits
2+
3+
Implements ε-Greedy, UCB1, and Thompson Sampling on Bernoulli bandits.
4+
5+
## Run experiments
6+
```bash
7+
python -m ch3_multi_armed_bandits.experiments --K 10 --T 5000 --trials 50 --eps 0.1 --c 1.0
8+
```
9+
10+
## Worked Examples
11+
See `examples/` for scripts reproducing the numerical examples:
12+
- Example 3.1: `ex1_regret_basic.py`
13+
- Example 3.2: `ex2_epsilon_update.py`
14+
- Example 3.3: `ex3_ucb_score.py`
15+
- Example 3.4: `ex4_thompson_update.py`
16+
17+
## Tests
18+
```bash
19+
pytest -q ch3_multi_armed_bandits/tests
20+
```
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1+
# Public submodule names re-exported by `from ch3_multi_armed_bandits import *`.
__all__ = ["bandits", "epsilon_greedy", "ucb", "thompson", "experiments"]

ch3_multi_armed_bandits/bandits.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,32 @@
11
from __future__ import annotations
2-
import numpy as np
32
from dataclasses import dataclass
3+
from typing import Iterable, Optional
4+
import numpy as np
45

56
@dataclass
class BernoulliBandit:
    """Stationary K-armed Bernoulli bandit environment.

    Attributes:
        p: iterable of per-arm success probabilities; coerced to a 1-D float array.
        seed: optional seed for the internal reward generator.
    """

    p: Iterable[float]
    seed: Optional[int] = None

    def __post_init__(self):
        self.p = np.asarray(list(self.p), dtype=float)
        # Restore the 1-D check the previous implementation asserted: a 2-D
        # input would otherwise reach step() and compare per-row silently.
        # Also reject zero arms — every user of K assumes at least one.
        if self.p.ndim != 1 or self.p.size == 0:
            raise ValueError("p must be a non-empty 1-D sequence of probabilities.")
        if np.any(self.p < 0) or np.any(self.p > 1):
            raise ValueError("All probabilities must be in [0,1].")
        self.K = int(self.p.size)
        self._rng = np.random.default_rng(self.seed)

    def step(self, arm: int) -> int:
        """Pull `arm` and return a Bernoulli(p[arm]) reward in {0, 1}.

        Raises:
            IndexError: if `arm` is outside [0, K).
        """
        if not (0 <= arm < self.K):
            raise IndexError("Arm index out of range.")
        return int(self._rng.random() < self.p[arm])

    def reset(self, seed: Optional[int] = None):
        """Re-initialize the reward stream with a freshly seeded generator."""
        self._rng = np.random.default_rng(seed)
1525

16-
def pull(self, arm: int, rng: np.random.Generator) -> float:
17-
return float(rng.random() < self.probs[arm])
26+
def regret_from_choices(true_means: np.ndarray, choices: np.ndarray, rewards: np.ndarray) -> np.ndarray:
    """Cumulative pseudo-regret of a run: cumsum_t (mu* - mu[a_t]).

    Fixes the original implementation, which ignored `choices` entirely
    (despite the function's name) and returned the noisy realized regret
    mu*·t − Σ r_t. Pseudo-regret is the standard, lower-variance quantity
    for bandit plots and is what the chosen arms determine.

    Args:
        true_means: per-arm true success probabilities.
        choices: arm index chosen at each step.
        rewards: realized rewards; retained for backward compatibility of the
            signature, no longer used.

    Returns:
        1-D array of cumulative pseudo-regret, one entry per step.
    """
    true_means = np.asarray(true_means, dtype=float)
    mu_star = float(np.max(true_means))
    picked = np.asarray(choices, dtype=int)
    return np.cumsum(mu_star - true_means[picked])
1830

19-
def pseudo_regret(self, arm: int) -> float:
20-
return self.opt_mean - float(self.probs[arm])
31+
def ensure_rng(seed: Optional[int]) -> np.random.Generator:
    """Return a fresh NumPy Generator for *seed* (None draws OS entropy)."""
    return np.random.default_rng(seed)
Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,26 @@
11
from __future__ import annotations
2+
from typing import Optional, Dict, Any
23
import numpy as np
4+
from .bandits import BernoulliBandit, regret_from_choices, ensure_rng
35

4-
class EpsilonGreedy:
5-
def __init__(self, K: int, epsilon: float = 0.1, rng: np.random.Generator | None = None):
6-
self.K = K
7-
self.epsilon = float(epsilon)
8-
self.rng = rng or np.random.default_rng()
9-
self.counts = np.zeros(K, dtype=int)
10-
self.values = np.zeros(K, dtype=float)
11-
12-
def select_arm(self) -> int:
13-
if self.rng.random() < self.epsilon:
14-
return int(self.rng.integers(self.K))
15-
return int(np.argmax(self.values))
16-
17-
def update(self, arm: int, reward: float):
18-
self.counts[arm] += 1
19-
n = self.counts[arm]
20-
self.values[arm] += (reward - self.values[arm]) / n
6+
def run(true_means, epsilon: float, steps: int, seed: Optional[int] = None) -> Dict[str, Any]:
    """Run ε-greedy with incremental sample-average updates on a Bernoulli bandit.

    Args:
        true_means: per-arm success probabilities.
        epsilon: exploration probability, must lie in [0, 1].
        steps: horizon T (number of pulls).
        seed: optional seed for reproducibility.

    Returns:
        dict with per-step "rewards" and "choices", final value estimates "Q",
        pull counts "N", and the cumulative regret curve "cum_regret".

    Raises:
        ValueError: if epsilon is outside [0, 1].
    """
    if not (0 <= float(epsilon) <= 1):
        raise ValueError("epsilon must be in [0,1].")
    K = len(true_means)
    # Bug fix: the original seeded BOTH the environment and the agent with the
    # same integer, so the two default_rng generators produced bit-identical
    # uniform streams — the exploration coin at step t was correlated with the
    # reward draw. Spawn two independent child streams from one SeedSequence.
    env_ss, agent_ss = np.random.SeedSequence(seed).spawn(2)
    env = BernoulliBandit(true_means, seed=env_ss)
    rng = ensure_rng(agent_ss)
    Q = np.zeros(K)
    N = np.zeros(K, dtype=int)
    choices = np.zeros(steps, int)
    rewards = np.zeros(steps, float)
    for t in range(steps):
        # Explore uniformly with probability ε, otherwise exploit current best.
        if rng.random() < epsilon:
            a = int(rng.integers(0, K))
        else:
            a = int(np.argmax(Q))
        r = env.step(a)
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]  # incremental sample-average update
        choices[t], rewards[t] = a, r
    return {
        "rewards": rewards, "choices": choices, "Q": Q, "N": N,
        "cum_regret": regret_from_choices(np.asarray(true_means, float), choices, rewards),
    }
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import numpy as np


def compute():
    """Example 3.1: cumulative regret of always pulling a fixed suboptimal arm."""
    mu = np.array([0.5, 0.6, 0.7])
    horizon = 100
    chosen = 0
    mu_star = mu.max()
    regret = horizon * mu_star - horizon * mu[chosen]
    return {
        "optimal": np.argmax(mu) + 1,
        "mu*": mu_star,
        "chosen_mu": mu[chosen],
        "regret": regret,
    }


if __name__ == "__main__":
    print(compute())
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
def compute():
    """Example 3.2: one incremental sample-average update of an action value."""
    count, value, reward = 4, 0.5, 1
    new_count = count + 1
    new_value = value + (reward - value) / new_count
    return {"N": count, "N_new": new_count, "Q_new": new_value}


if __name__ == "__main__":
    print(compute())
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import numpy as np, math


def compute():
    """Example 3.3: UCB1 scores for three arms and the selected arm (1-based)."""
    estimates = [0.6, 0.7, 0.4]
    pulls = [5, 10, 2]
    step, c = 20, 1.0
    scores = []
    for q, n in zip(estimates, pulls):
        scores.append(q + c * math.sqrt(math.log(step) / n))
    return {"ucb": scores, "selected": np.argmax(scores) + 1}


if __name__ == "__main__":
    print(compute())
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
def compute():
    """Example 3.4: Beta posterior parameters after observed successes/failures."""
    # Prior Beta(1,1); arm 1 saw 3 successes / 2 failures, arm 2 saw 1 / 4.
    alpha = [1 + 3, 1 + 1]
    beta = [1 + 2, 1 + 4]
    return {"alpha": alpha, "beta": beta}


if __name__ == "__main__":
    print(compute())
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import ex1_regret_basic
import ex2_epsilon_update
import ex3_ucb_score
import ex4_thompson_update

# Run every worked example in chapter order and print its result.
for _example in (ex1_regret_basic, ex2_epsilon_update, ex3_ucb_score, ex4_thompson_update):
    print(_example.compute())
Lines changed: 39 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,44 @@
1-
from __future__ import annotations
2-
import os
3-
import numpy as np
4-
import matplotlib.pyplot as plt
5-
from .bandits import BernoulliBandit
6-
from .epsilon_greedy import EpsilonGreedy
7-
from .ucb import UCB1
8-
from .thompson import ThompsonSamplingBernoulli
1+
import argparse, os, numpy as np, matplotlib.pyplot as plt
2+
from .epsilon_greedy import run as run_eps
3+
from .ucb import run as run_ucb
4+
from .thompson import run as run_ts
95

10-
def run_algorithm(env, algo, T: int, seed: int) -> dict:
11-
rng = np.random.default_rng(seed)
12-
rewards = np.zeros(T, dtype=float)
13-
regret = np.zeros(T, dtype=float)
14-
for t in range(T):
15-
a = algo.select_arm()
16-
r = env.pull(a, rng)
17-
algo.update(a, r)
18-
rewards[t] = r
19-
regret[t] = env.pseudo_regret(a)
20-
return {
21-
"rewards": rewards,
22-
"cum_rewards": np.cumsum(rewards),
23-
"regret": regret,
24-
"cum_regret": np.cumsum(regret),
25-
}
6+
def parse_args(argv=None):
    """Parse experiment CLI options.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:]
            (argparse's own default). Added, backward-compatibly, so the
            parser can be driven programmatically and unit-tested.

    Returns:
        argparse.Namespace with K, T, trials, eps, c, seed, outdir.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--K", type=int, default=10, help="number of arms")
    p.add_argument("--T", type=int, default=5000, help="horizon (steps per trial)")
    p.add_argument("--trials", type=int, default=50, help="independent trials to average")
    p.add_argument("--eps", type=float, default=0.1, help="exploration rate for epsilon-greedy")
    p.add_argument("--c", type=float, default=1.0, help="UCB1 exploration coefficient")
    p.add_argument("--seed", type=int, default=123, help="master seed")
    p.add_argument("--outdir", type=str, default="ch3_multi_armed_bandits/plots",
                   help="directory for output plots")
    return p.parse_args(argv)
16+
17+
def make_true_means(K, rng):
    """Draw K arm means i.i.d. uniformly from [0.1, 0.9) using *rng*."""
    low, high = 0.1, 0.9
    return rng.uniform(low, high, size=K)
2618

27-
def average_over_runs(env, algo_ctor, T: int, n_runs: int, base_seed: int = 0) -> dict:
28-
cum_regrets = []
29-
for run in range(n_runs):
30-
algo = algo_ctor()
31-
result = run_algorithm(env, algo, T, seed=base_seed + run)
32-
cum_regrets.append(result["cum_regret"])
33-
cum_regrets = np.array(cum_regrets)
34-
mean = cum_regrets.mean(axis=0)
35-
se = cum_regrets.std(axis=0, ddof=1) / np.sqrt(n_runs)
36-
return {"mean": mean, "se": se}
19+
def run_all(true_means, T, trials, eps, c, seed):
    """Average the cumulative-regret curves of all three algorithms.

    Each of the `trials` runs uses one fresh seed, shared by the three
    algorithms so they face identically seeded runs; curves are averaged
    element-wise over trials.
    """
    seeder = np.random.default_rng(seed)
    totals = {name: np.zeros(T) for name in ("eps", "ucb", "ts")}
    for _ in range(trials):
        trial_seed = int(seeder.integers(0, 2**31 - 1))
        totals["eps"] += run_eps(true_means, eps, T, trial_seed)["cum_regret"]
        totals["ucb"] += run_ucb(true_means, c, T, trial_seed)["cum_regret"]
        totals["ts"] += run_ts(true_means, T, trial_seed)["cum_regret"]
    return {name: total / trials for name, total in totals.items()}
3729

38-
def plot_regret(curves: dict, title: str, fname: str | None):
39-
fig, ax = plt.subplots()
40-
for label, stats in curves.items():
41-
ax.plot(stats["mean"], label=label)
42-
ax.set_xlabel("Time")
43-
ax.set_ylabel("Average cumulative pseudo-regret")
44-
ax.set_title(title)
45-
ax.legend()
46-
if fname:
47-
out_dir = os.path.dirname(fname)
48-
if out_dir and not os.path.exists(out_dir):
49-
os.makedirs(out_dir, exist_ok=True)
50-
fig.savefig(fname, bbox_inches="tight")
51-
else:
52-
plt.show()
30+
def plot(xs, series, ylabel, title, outpath):
    """Plot labelled curves against *xs* and save the figure to *outpath*.

    Args:
        xs: shared x-axis values.
        series: iterable of (label, y-values) pairs.
        ylabel: y-axis label.
        title: figure title.
        outpath: destination image path; parent directories are created.
    """
    plt.figure()
    for label, ys in series:
        plt.plot(xs, ys, label=label)
    plt.xlabel("Time")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    # Bug fix: os.makedirs("") raises FileNotFoundError when outpath has no
    # directory component; only create the directory when one is present.
    outdir = os.path.dirname(outpath)
    if outdir:
        os.makedirs(outdir, exist_ok=True)
    plt.savefig(outpath, dpi=300)
    plt.close()
5336

5437
def main():
    """CLI entry point: sample a random bandit, run all algorithms, save the plot."""
    args = parse_args()
    means = make_true_means(args.K, np.random.default_rng(args.seed))
    regrets = run_all(means, args.T, args.trials, args.eps, args.c, args.seed)
    timesteps = np.arange(1, args.T + 1)
    curves = [
        ("ε-Greedy", regrets["eps"]),
        ("UCB1", regrets["ucb"]),
        ("Thompson", regrets["ts"]),
    ]
    plot(timesteps, curves, "Cumulative Regret", "Regret vs Time",
         os.path.join(args.outdir, "regret.png"))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)