From 69e81b574dc99774215bd25bf287fd9f58b2672d Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Fri, 22 May 2026 11:54:30 +0200
Subject: [PATCH 1/8] MAINT: add web deps

---
 pyproject.toml |   3 +
 uv.lock        | 159 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index cf0bb03..0d39bc9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,9 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
+    "bleach>=6.3.0",
+    "flask>=3.1.3",
+    "markdown>=3.10.2",
     "openai>=2.33.0",
     "pydantic>=2.13.3",
     "tqdm>=4.67.3",
diff --git a/uv.lock b/uv.lock
index c0651b8..869ffcd 100644
--- a/uv.lock
+++ b/uv.lock
@@ -34,6 +34,9 @@ name = "ardian-dataset-bench"
 version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
+    { name = "bleach" },
+    { name = "flask" },
+    { name = "markdown" },
     { name = "openai" },
     { name = "pydantic" },
     { name = "tqdm" },
@@ -42,6 +45,9 @@ dependencies = [
 
 [package.metadata]
 requires-dist = [
+    { name = "bleach", specifier = ">=6.3.0" },
+    { name = "flask", specifier = ">=3.1.3" },
+    { name = "markdown", specifier = ">=3.10.2" },
     { name = "openai", specifier = ">=2.33.0" },
     { name = "pydantic", specifier = ">=2.13.3" },
     { name = "tqdm", specifier = ">=4.67.3" },
@@ -61,6 +67,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
 ]
 
+[[package]]
+name = "bleach"
+version = "6.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "webencodings" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/07/18/3c8523962314be6bf4c8989c79ad9531c825210dd13a8669f6b84336e8bd/bleach-6.3.0.tar.gz", hash = "sha256:6f3b91b1c0a02bb9a78b5a454c92506aa0fdf197e1d5e114d2e00c6f64306d22", size = 203533, upload-time = "2025-10-27T17:57:39.211Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
+]
+
+[[package]]
+name = "blinker"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
+]
+
 [[package]]
 name = "certifi"
 version = "2026.4.22"
@@ -172,6 +199,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" },
 ]
 
+[[package]]
+name = "click"
+version = "8.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9b/98/518d8e5081007684232226f475082b30087d0f585e8457db087298259f49/click-8.4.1.tar.gz", hash = "sha256:918b5633eddf6b41c32d4f454bf0de810065c74e3f7dbf8ee5452f8be88d3e96", size = 353007, upload-time = "2026-05-22T04:08:37.769Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/0d/67e5b4109ea4a837e80daa87c2c696711955e40449a97e8926672534def2/click-8.4.1-py3-none-any.whl", hash = "sha256:482be17c6991b8c19c5429a1e995d9b0efdbb63172824c41f99965dc0ade8ec2", size = 116639, upload-time = "2026-05-22T04:08:35.26Z" },
+]
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -223,6 +262,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
 ]
 
+[[package]]
+name = "flask"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "blinker" },
+    { name = "click" },
+    { name = "itsdangerous" },
+    { name = "jinja2" },
+    { name = "markupsafe" },
+    { name = "werkzeug" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" },
+]
+
 [[package]]
 name = "frozendict"
 version = "2.4.7"
@@ -278,6 +334,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" },
 ]
 
+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
+]
+
 [[package]]
 name = "jiter"
 version = "0.14.0"
@@ -332,6 +409,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" },
 ]
 
+[[package]]
+name = "markdown"
+version = "3.10.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" },
+]
+
 [[package]]
 name = "markdown-it-py"
 version = "4.0.0"
@@ -344,6 +430,58 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
 ]
 
+[[package]]
+name = "markupsafe"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" },
+    { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" },
+    { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" },
+    { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" },
+    { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" },
+    { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" },
+    { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" },
+    { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
+    { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
+    { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
+    { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
+    { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
+    { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
+    { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
+    { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
+    { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
+    { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
+    { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
+    { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
+]
+
 [[package]]
 name = "mdurl"
 version = "0.1.2"
@@ -698,6 +836,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
 ]
 
+[[package]]
+name = "webencodings"
+version = "0.5.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721, upload-time = "2017-04-05T20:21:34.189Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload-time = "2017-04-05T20:21:32.581Z" },
+]
+
 [[package]]
 name = "websockets"
 version = "16.0"
@@ -734,6 +881,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
 ]
 
+[[package]]
+name = "werkzeug"
+version = "3.1.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852, upload-time = "2026-04-02T18:49:14.268Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/93/8c/2e650f2afeb7ee576912636c23ddb621c91ac6a98e66dc8d29c3c69446e1/werkzeug-3.1.8-py3-none-any.whl", hash = "sha256:63a77fb8892bf28ebc3178683445222aa500e48ebad5ec77b0ad80f8726b1f50", size = 226459, upload-time = "2026-04-02T18:49:12.72Z" },
+]
+
 [[package]]
 name = "yfinance"
 version = "1.3.0"

From 6e441d9a9bd17288a288da558ad13f8001498ada Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Fri, 22 May 2026 11:54:30 +0200
Subject: [PATCH 2/8] MAINT: ignore OCR sessions

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 894a2bc..4db1aa8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,9 @@ doc_text_processing/CEO_word_extraction/cleaning_extractions/cleaned/
 KPI_analysis/cache/
 KPI_analysis/output/
 
+# OCR annotation artifacts
+annotation_OCR/sessions/
+
 # VSCode settings
 .vscode/settings.json
 

From 1e3f055c58dc74e85bed6676bb6f355cae1fe935 Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Fri, 22 May 2026 11:54:35 +0200
Subject: [PATCH 3/8] ENH: add OCR annotator

---
 annotation_OCR/__init__.py          |   1 +
 annotation_OCR/ocr_index.py         | 604 ++++++++++++++++++++++++++++
 annotation_OCR/server.py            | 365 +++++++++++++++++
 annotation_OCR/static/app.js        | 352 ++++++++++++++++
 annotation_OCR/static/style.css     | 448 +++++++++++++++++++++
 annotation_OCR/store.py             | 400 ++++++++++++++++++
 annotation_OCR/summarize.py         |  58 +++
 annotation_OCR/templates/index.html | 204 ++++++++++
 8 files changed, 2432 insertions(+)
 create mode 100644 annotation_OCR/__init__.py
 create mode 100644 annotation_OCR/ocr_index.py
 create mode 100644 annotation_OCR/server.py
 create mode 100644 annotation_OCR/static/app.js
 create mode 100644 annotation_OCR/static/style.css
 create mode 100644 annotation_OCR/store.py
 create mode 100644 annotation_OCR/summarize.py
 create mode 100644 annotation_OCR/templates/index.html

diff --git a/annotation_OCR/__init__.py b/annotation_OCR/__init__.py
new file mode 100644
index 0000000..e045a18
--- /dev/null
+++ b/annotation_OCR/__init__.py
@@ -0,0 +1 @@
+"""OCR annotation interface package."""
diff --git a/annotation_OCR/ocr_index.py b/annotation_OCR/ocr_index.py
new file mode 100644
index 0000000..8916981
--- /dev/null
+++ b/annotation_OCR/ocr_index.py
@@ -0,0 +1,604 @@
+"""Build page-level OCR annotation queues.
+
+The annotation UI compares one raw page image with the corresponding Markdown
+page extracted by DeepSeekOCR. Page positions are preserved exactly: page index
+``i`` in an ``.mmd`` split maps to ``pages/page_XXXX.png`` with the same
+zero-based index when the raw image exists.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import random
+import re
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+
+HERE = Path(__file__).resolve().parent
+REPO_ROOT = HERE.parent
+
+DEFAULT_OCR_ROOT = REPO_ROOT / "DeepSeekOCR_Ardian_pruned_1k"
+DEFAULT_RAW_ROOT = Path(
+    "/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs"
+)
+
+PAGE_SPLIT_RE = re.compile(r"<---\s*Page Split\s*--->", re.IGNORECASE)
+REPORT_NAME_RE = re.compile(r"^([A-Z0-9-]+)_(.+)_(\d{4})(?:_[0-9a-fA-F]{8,})?$")
+HASH_SUFFIX_RE = re.compile(r"_[0-9a-fA-F]{8,}$")
+
+CORE_KPI_ALIASES = {
+    "revenue": [
+        "net sales",
+        "total net sales",
+        "sales revenue",
+        "revenues",
+        "revenue",
+        "net revenue",
+    ],
+    "gross_profit": ["gross profit", "gross margin"],
+    "operating_income": [
+        "operating income",
+        "income from operations",
+        "operating profit",
+    ],
+    "net_income": [
+        "net income",
+        "net earnings",
+        "net loss",
+        "net income attributable",
+    ],
+    "total_assets": ["total assets"],
+    "total_liabilities": ["total liabilities", "liabilities"],
+    "cash_and_equivalents": [
+        "cash and cash equivalents",
+        "cash equivalents",
+        "cash, cash equivalents",
+    ],
+    "operating_cash_flow": [
+        "net cash provided by operating activities",
+        "cash flow from operating activities",
+        "operating cash flow",
+    ],
+    "capex": [
+        "capital expenditures",
+        "capital expenditure",
+        "additions to property, plant and equipment",
+        "purchase of property and equipment",
+        "additions of long-lived assets",
+    ],
+}
+
+FINANCIAL_TABLE_HEADINGS = [
+    "consolidated statement of operations",
+    "consolidated statements of operations",
+    "consolidated income statement",
+    "consolidated statements of income",
+    "consolidated balance sheet",
+    "consolidated balance sheets",
+    "consolidated cash flow statement",
+    "consolidated statements of cash flows",
+    "consolidated statement of cash flows",
+    "statements of comprehensive income",
+    "statement of financial position",
+    "notes to the consolidated financial statements",
+    "selected financial data",
+    "five year record",
+]
+
+NUMERIC_ROW_RE = re.compile(
+    r"(?<![A-Za-z])\(?\$?\d{1,3}(?:[,\s]\d{3})+(?:\.\d+)?\)?|(?<![A-Za-z])\$?\d+\.\d+"
+)
+MARKDOWN_TABLE_SEPARATOR_RE = re.compile(
+    r"^\s*\|?\s*:?-{3,}:?\s*(?:\|\s*:?-{3,}:?\s*)+\|?\s*$"
+)
+
+
+@dataclass(frozen=True)
+class ReportInfo:
+    industry_slug: str  # discover_reports uses the report directory's parent name
+    name: str  # report directory name
+    exchange: str  # first component parsed from the directory name
+    ticker: str  # middle component parsed from the directory name
+    year: int  # four-digit year parsed from the directory name
+    report_dir: Path
+    mmd_path: Path  # the .mmd file selected by find_mmd
+
+
+@dataclass
+class PageItem:
+    item_id: str  # "<industry_slug>/<report_name>/page_<page_index:04d>"
+    industry_slug: str
+    report_name: str
+    exchange: str
+    ticker: str
+    year: int
+    page_index: int  # zero-based position within the split .mmd pages
+    page_number: int  # page_index + 1
+    ocr_root: str
+    raw_root: str
+    report_dir: str
+    raw_dir: str | None  # None when no raw directory was resolved
+    mmd_path: str
+    raw_png_path: str | None  # None when no page image was found
+    mmd_page_count: int  # number of pages in the .mmd split
+    png_page_count: int  # number of page_*.png files found
+    mapping_status: str  # status string returned by resolve_raw_dir
+    mapping_warnings: list[str]
+    candidate_reasons: list[str]
+    page_text_sha256: str
+    page_text_chars: int
+    page_text_preview: str
+    page_text: str  # iter_page_items stores "" here; see to_manifest_record
+
+    def to_manifest_record(self, *, include_text: bool = False) -> dict[str, Any]:
+        record = asdict(self)
+        if not include_text:
+            record.pop("page_text", None)  # manifests omit the text unless asked
+        return record
+
+
+def parse_report_name(name: str) -> tuple[str, str, int] | None:
+    """Parse *name* into ``(exchange, ticker, year)``; ``None`` when it does not match."""
+    if (parsed := REPORT_NAME_RE.match(name)) is None:
+        return None
+    return parsed[1], parsed[2], int(parsed[3])
+
+
+def strip_hash_suffix(name: str) -> str:  # drops a trailing "_<8+ hex digits>" suffix
+    return HASH_SUFFIX_RE.sub("", name)
+
+
+def report_base_name(name: str) -> str:
+    """Return ``EXCHANGE_TICKER_YEAR`` if *name* parses, else *name* minus its hash suffix."""
+    fields = parse_report_name(name)
+    if fields is not None:
+        return "{}_{}_{}".format(*fields)
+    return strip_hash_suffix(name)
+
+
+def find_mmd(report_dir: Path) -> Path | None:
+    """Pick the ``.mmd`` file that represents *report_dir*.
+
+    Preference order: ``<dir name>.mmd``, then ``<base name>.mmd``, then the
+    first (sorted) ``.mmd`` that is not a ``*_det.mmd`` file, then any ``.mmd``.
+    Returns ``None`` when the directory holds no ``.mmd`` file at all.
+    """
+    for stem in (report_dir.name, report_base_name(report_dir.name)):
+        named = report_dir / f"{stem}.mmd"
+        if named.is_file():
+            return named
+
+    every_mmd = sorted(report_dir.glob("*.mmd"))
+    for mmd in every_mmd:
+        if not mmd.name.endswith("_det.mmd"):
+            return mmd
+    return every_mmd[0] if every_mmd else None
+
+
+def discover_reports(root: Path) -> list[ReportInfo]:
+    """Find every parsable report directory below *root*, ordered by path.
+
+    A report directory directly contains an ``.mmd`` file and has a name that
+    matches ``REPORT_NAME_RE``; its parent directory name is the industry slug.
+    """
+    found: list[ReportInfo] = []
+    for directory in sorted({path.parent for path in root.rglob("*.mmd")}):
+        fields = parse_report_name(directory.name)
+        mmd_file = find_mmd(directory) if fields is not None else None
+        if fields is None or mmd_file is None:
+            continue
+        found.append(
+            ReportInfo(
+                industry_slug=directory.parent.name,
+                name=directory.name,
+                exchange=fields[0],
+                ticker=fields[1],
+                year=fields[2],
+                report_dir=directory,
+                mmd_path=mmd_file,
+            )
+        )
+    return found
+
+
+def split_pages(raw: str) -> list[str]:
+    """Split OCR text on page-split markers, dropping one empty trailing page."""
+    chunks = list(map(str.strip, PAGE_SPLIT_RE.split(raw)))
+    has_empty_tail = bool(chunks) and chunks[-1] == ""
+    return chunks[:-1] if has_empty_tail else chunks
+
+
+def load_pages(mmd_path: Path) -> list[str]:
+    """Read *mmd_path* as UTF-8 (undecodable bytes replaced) and split it into pages."""
+    return split_pages(mmd_path.read_text(encoding="utf-8", errors="replace"))
+
+
+def resolve_raw_dir(report: ReportInfo, raw_root: Path) -> tuple[Path | None, str]:
+    """Locate the raw-image directory for *report* and describe how it was found.
+
+    Tries, in order: the exact report name, the hash-stripped base name, then a
+    unique ``<base name>*`` glob match inside ``raw_root/<industry_slug>``.
+    Returns ``(directory or None, status string)``.
+    """
+    industry_root = raw_root / report.industry_slug
+    if not industry_root.is_dir():
+        return None, "raw-industry-missing"
+
+    base_name = report_base_name(report.name)
+    for folder, status in ((report.name, "ok-exact"), (base_name, "ok-hash-stripped")):
+        if (industry_root / folder).is_dir():
+            return industry_root / folder, status
+
+    globbed = sorted(p for p in industry_root.glob(f"{base_name}*") if p.is_dir())
+    if not globbed:
+        return None, "raw-dir-missing"
+    if len(globbed) > 1:
+        return None, "raw-dir-ambiguous"
+    return globbed[0], "ok-glob"
+
+
+def list_page_pngs(raw_dir: Path | None) -> list[Path]:
+    """Return the sorted ``pages/page_*.png`` files under *raw_dir* (``[]`` if absent)."""
+    pages_dir = None if raw_dir is None else raw_dir / "pages"
+    if pages_dir is None or not pages_dir.is_dir():
+        return []
+    png_files = [entry for entry in pages_dir.glob("page_*.png") if entry.is_file()]
+    return sorted(png_files)
+
+
+def page_png_for(page_pngs: list[Path], page_index: int) -> Path | None:
+    """Return the PNG for *page_index*: by ``page_XXXX.png`` name, else by list position."""
+    wanted = f"page_{page_index:04d}.png"
+    by_name = next((png for png in page_pngs if png.name == wanted), None)
+    if by_name is not None:
+        return by_name
+    in_range = 0 <= page_index < len(page_pngs)
+    return page_pngs[page_index] if in_range else None
+
+
+def has_markdown_table(lines: list[str]) -> bool:
+    """True when a separator row exists or at least two lines contain 2+ pipes."""
+    separator_rows = filter(MARKDOWN_TABLE_SEPARATOR_RE.match, lines)
+    piped_rows = [line for line in lines if line.count("|") >= 2]
+    return next(separator_rows, None) is not None or len(piped_rows) >= 2
+
+
+def dense_numeric_row_count(lines: list[str]) -> int:  # lines with >= 3 numeric matches
+    return sum(1 for line in lines if len(NUMERIC_ROW_RE.findall(line)) >= 3)
+
+
+def detect_candidate_reasons(text: str) -> list[str]:
+    """Return the heuristic signals suggesting *text* holds financial table data.
+
+    Checks table markup, dense numeric rows, statement headings and KPI aliases.
+    """
+    lowered = text.lower()
+    lines = [line.strip() for line in text.splitlines() if line.strip()]
+    reasons: list[str] = []
+    if has_markdown_table(lines):
+        reasons.append("markdown-table")
+    if "<table" in lowered or "</td>" in lowered or "</tr>" in lowered:
+        reasons.append("html-table")
+    if dense_numeric_row_count(lines) >= 3:
+        reasons.append("dense-numeric-rows")
+    if any(heading in lowered for heading in FINANCIAL_TABLE_HEADINGS):
+        reasons.append("financial-heading")
+    # Count distinct KPIs, not aliases: overlapping aliases ("revenue" inside
+    # "revenues", "cash equivalents" inside "cash and cash equivalents") would
+    # otherwise let a single mention satisfy the two-hit threshold on its own.
+    kpi_hits = sum(any(a in lowered for a in v) for v in CORE_KPI_ALIASES.values())
+    if kpi_hits >= 2:
+        reasons.append("kpi-aliases")
+    return reasons
+
+
+def text_preview(text: str, max_chars: int = 500) -> str:
+    """Collapse whitespace and cap at *max_chars*; truncated text ends in ``...``."""
+    compact = " ".join(text.split())
+    keep = max(max_chars - 3, 0)  # reserve room for the 3-character ellipsis
+    return compact if len(compact) <= max_chars else compact[:keep].rstrip() + "..."
+
+
+def page_text_hash(text: str) -> str:  # hex SHA-256 digest of the UTF-8 encoded text
+    return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
+
+
+def make_mapping_warnings(
+    *, raw_dir: Path | None, page_pngs: list[Path], page_index: int, mmd_page_count: int
+) -> list[str]:
+    """List the problems found when pairing one Markdown page with its raw PNG."""
+    no_pages_dir = raw_dir is not None and not (raw_dir / "pages").is_dir()
+    # Only the labels whose check failed are reported, in this order.
+    checks = (
+        ("raw-directory-missing", raw_dir is None),
+        ("raw-pages-directory-missing", no_pages_dir),
+        ("page-count-mismatch", len(page_pngs) != mmd_page_count),
+        ("raw-page-image-missing", page_png_for(page_pngs, page_index) is None),
+    )
+    return [label for label, failed in checks if failed]
+
+
+def build_all_items(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    limit_reports: int | None = None,
+) -> list[PageItem]:
+    """Materialise every page item produced by ``iter_page_items`` into a list."""
+    page_stream = iter_page_items(
+        ocr_root=ocr_root,
+        raw_root=raw_root,
+        limit_reports=limit_reports,
+    )
+    return [*page_stream]
+
+
+def iter_page_items(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    limit_reports: int | None = None,
+):
+    """Yield one ``PageItem`` per Markdown page of every discovered report.
+
+    When *limit_reports* is given only the first that many reports are read.
+    ``page_text`` is left empty on each item; its hash, length and preview
+    are recorded instead.
+    """
+    reports = discover_reports(ocr_root)
+    for report in reports if limit_reports is None else reports[:limit_reports]:
+        pages = load_pages(report.mmd_path)
+        raw_dir, raw_status = resolve_raw_dir(report, raw_root)
+        page_pngs = list_page_pngs(raw_dir)
+        per_report = dict(
+            industry_slug=report.industry_slug,
+            report_name=report.name,
+            exchange=report.exchange,
+            ticker=report.ticker,
+            year=report.year,
+            ocr_root=str(ocr_root),
+            raw_root=str(raw_root),
+            report_dir=str(report.report_dir),
+            raw_dir=str(raw_dir) if raw_dir else None,
+            mmd_path=str(report.mmd_path),
+            mmd_page_count=len(pages),
+            png_page_count=len(page_pngs),
+            mapping_status=raw_status,
+        )
+        for page_index, page_text in enumerate(pages):
+            raw_png = page_png_for(page_pngs, page_index)
+            yield PageItem(
+                item_id=f"{report.industry_slug}/{report.name}/page_{page_index:04d}",
+                page_index=page_index,
+                page_number=page_index + 1,
+                raw_png_path=str(raw_png) if raw_png else None,
+                mapping_warnings=make_mapping_warnings(
+                    raw_dir=raw_dir,
+                    page_pngs=page_pngs,
+                    page_index=page_index,
+                    mmd_page_count=len(pages),
+                ),
+                candidate_reasons=detect_candidate_reasons(page_text),
+                page_text_sha256=page_text_hash(page_text),
+                page_text_chars=len(page_text),
+                page_text_preview=text_preview(page_text),
+                page_text="",
+                **per_report,
+            )
+
+
+def new_summary_state() -> dict[str, Any]:
+    """Create the empty accumulator that ``update_summary_state`` fills in."""
+    state: dict[str, Any] = dict(report_names=set(), pages_total=0)
+    # One tally dict per counted dimension.
+    state["mapping_status_counts"] = {}
+    state["mapping_warning_counts"] = {}
+    state["candidate_reason_counts"] = {}
+    return state
+
+
+def update_summary_state(state: dict[str, Any], item: PageItem) -> None:
+    """Fold one page *item* into the running *state* tallies (mutates *state*)."""
+    state["report_names"].add(item.report_name)
+    state["pages_total"] += 1
+    for bucket, labels in (
+        (state["mapping_status_counts"], [item.mapping_status]),
+        (state["mapping_warning_counts"], item.mapping_warnings),
+        (state["candidate_reason_counts"], item.candidate_reasons),
+    ):
+        for label in labels:
+            bucket[label] = bucket.get(label, 0) + 1
+
+
+def finish_summary_state(
+    state: dict[str, Any], queue: list[PageItem]
+) -> dict[str, Any]:
+    """Combine the accumulated *state* with the final *queue* into a summary dict."""
+    summary: dict[str, Any] = {"reports_total": len(state["report_names"])}
+    summary["pages_total"] = state["pages_total"]
+    summary["queue_reports"] = len({entry.report_name for entry in queue})
+    summary["queue_pages"] = len(queue)
+    summary["mapping_status_counts"] = state["mapping_status_counts"]
+    summary["mapping_warning_counts"] = state["mapping_warning_counts"]
+    summary["candidate_reason_counts"] = state["candidate_reason_counts"]
+    return summary
+
+
+def select_queue(
+    items: list[PageItem],
+    *,
+    queue_mode: str,
+    sample_size: int | None = None,
+    seed: int = 17,
+    limit: int | None = None,
+) -> list[PageItem]:
+    """Choose the pages to annotate from *items* according to *queue_mode*.
+
+    ``"sample"`` draws up to *sample_size* (default 100) items with a seeded
+    RNG and returns them in (industry, report, page) order. Raises
+    ``ValueError`` for an unknown mode. *limit* truncates the final list.
+    """
+    if queue_mode not in ("all", "table-candidates", "sample"):
+        raise ValueError(f"unknown queue mode: {queue_mode}")
+    if queue_mode == "all":
+        chosen = list(items)
+    elif queue_mode == "table-candidates":
+        chosen = [entry for entry in items if entry.candidate_reasons]
+    else:
+        size = 100 if sample_size is None else sample_size
+        chosen = random.Random(seed).sample(items, min(size, len(items)))
+        chosen.sort(key=lambda e: (e.industry_slug, e.report_name, e.page_index))
+    return chosen if limit is None else chosen[:limit]
+
+
+def build_queue(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    queue_mode: str = "table-candidates",
+    sample_size: int | None = None,
+    seed: int = 17,
+    limit: int | None = None,
+    limit_reports: int | None = None,
+) -> tuple[list[PageItem], dict[str, Any]]:
+    if queue_mode not in {"all", "table-candidates", "sample"}:  # fail before scanning
+        raise ValueError(f"unknown queue mode: {queue_mode}")
+
+    queue: list[PageItem] = []
+    summary_state = new_summary_state()
+    rng = random.Random(seed)  # only consulted in "sample" mode
+    sample_seen = 0
+    sample_target = sample_size if sample_size is not None else 100
+    scan_stopped_by_limit = False
+
+    for item in iter_page_items(
+        ocr_root=ocr_root,
+        raw_root=raw_root,
+        limit_reports=limit_reports,
+    ):
+        update_summary_state(summary_state, item)
+        if queue_mode == "sample":  # reservoir sampling over the streamed pages
+            sample_seen += 1
+            if len(queue) < sample_target:
+                queue.append(item)
+            else:
+                replace_at = rng.randint(0, sample_seen - 1)
+                if replace_at < sample_target:
+                    queue[replace_at] = item
+            continue
+
+        include_item = queue_mode == "all" or bool(item.candidate_reasons)
+        if not include_item:
+            continue
+        queue.append(item)
+        if limit is not None and len(queue) >= limit:
+            scan_stopped_by_limit = True  # summary then covers only the pages scanned so far
+            break
+
+    if queue_mode == "sample":
+        queue.sort(
+            key=lambda item: (item.industry_slug, item.report_name, item.page_index)
+        )
+        if limit is not None:  # in sample mode the limit is applied after sorting
+            queue = queue[:limit]
+
+    summary = finish_summary_state(summary_state, queue)
+    summary.update(
+        {
+            "queue_mode": queue_mode,
+            "sample_size": sample_size,
+            "seed": seed,
+            "limit": limit,
+            "limit_reports": limit_reports,
+            "scan_stopped_by_limit": scan_stopped_by_limit,
+            "ocr_root": str(ocr_root),
+            "raw_root": str(raw_root),
+        }
+    )
+    return queue, summary
+
+
+def summarize_items(all_items: list[PageItem], queue: list[PageItem]) -> dict[str, Any]:
+    """Summarise report/page totals and status, warning and reason frequencies."""
+    tallies: dict[str, dict[str, int]] = {"status": {}, "warning": {}, "reason": {}}
+    def bump(kind: str, label: str) -> None:
+        tallies[kind][label] = tallies[kind].get(label, 0) + 1
+
+    for entry in all_items:
+        bump("status", entry.mapping_status)
+        for warning in entry.mapping_warnings:
+            bump("warning", warning)
+        for reason in entry.candidate_reasons:
+            bump("reason", reason)
+    return {
+        "reports_total": len({entry.report_name for entry in all_items}),
+        "pages_total": len(all_items),
+        "queue_reports": len({entry.report_name for entry in queue}),
+        "queue_pages": len(queue),
+        "mapping_status_counts": tallies["status"],
+        "mapping_warning_counts": tallies["warning"],
+        "candidate_reason_counts": tallies["reason"],
+    }
+
+
+def write_json(path: Path, payload: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_suffix(path.suffix + ".tmp")  # e.g. out.json -> out.json.tmp
+    tmp.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    tmp.replace(path)  # move the fully written file over the destination
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the queue builder."""
+    cli = argparse.ArgumentParser(description="Build an OCR page annotation queue.")
+    modes = ["all", "table-candidates", "sample"]
+    # Input locations.
+    cli.add_argument("--ocr-root", type=Path, default=DEFAULT_OCR_ROOT)
+    cli.add_argument("--raw-root", type=Path, default=DEFAULT_RAW_ROOT)
+    # Queue selection and output.
+    cli.add_argument("--queue-mode", choices=modes, default="table-candidates")
+    cli.add_argument("--sample-size", type=int, default=None)
+    cli.add_argument("--seed", type=int, default=17)
+    cli.add_argument("--limit", type=int, default=None, help="Maximum queued pages.")
+    cli.add_argument(
+        "--limit-reports",
+        type=int,
+        default=None,
+        help="Read only the first N reports before queue selection.",
+    )
+    cli.add_argument(
+        "--output", type=Path, default=None, help="Optional manifest JSON path."
+    )
+    cli.add_argument("--check", action="store_true", help="Print summary and exit.")
+    return cli
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Build the queue, optionally write the manifest, and print the summary.
+
+    The summary is printed when ``--check`` is set or ``--output`` is omitted.
+    """
+    options = build_arg_parser().parse_args(argv)
+    queue, summary = build_queue(
+        ocr_root=options.ocr_root,
+        raw_root=options.raw_root,
+        queue_mode=options.queue_mode,
+        sample_size=options.sample_size,
+        seed=options.seed,
+        limit=options.limit,
+        limit_reports=options.limit_reports,
+    )
+    records = [item.to_manifest_record() for item in queue]
+    if options.output:
+        write_json(options.output, {"summary": summary, "items": records})
+    if options.check or not options.output:
+        print(json.dumps(summary, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/annotation_OCR/server.py b/annotation_OCR/server.py
new file mode 100644
index 0000000..6ea3793
--- /dev/null
+++ b/annotation_OCR/server.py
@@ -0,0 +1,365 @@
+"""Browser-based OCR page annotation server."""
+
+from __future__ import annotations
+
+import argparse
+import re
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+import bleach
+import markdown as markdown_lib
+from flask import Flask, abort, jsonify, render_template, request, send_file
+
+from ocr_index import DEFAULT_OCR_ROOT, DEFAULT_RAW_ROOT, build_queue, load_pages
+from store import (
+    create_session,
+    list_sessions,
+    load_current_annotations,
+    load_manifest,
+    load_metadata,
+    save_annotation,
+    session_dir,
+    write_summary_files,
+)
+
+
+HERE = Path(__file__).resolve().parent
+IMAGE_REF_RE = re.compile(r"(!\[[^\]]*\]\()((?:\./)?images/[^)\s]+)(\))")
+HTML_IMAGE_SRC_RE = re.compile(r'(<img\b[^>]*\bsrc=["\'])(images/[^"\']+)(["\'])', re.I)
+
+ALLOWED_TAGS = set(bleach.sanitizer.ALLOWED_TAGS).union(
+    {
+        "p",
+        "br",
+        "pre",
+        "code",
+        "hr",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "table",
+        "thead",
+        "tbody",
+        "tfoot",
+        "tr",
+        "th",
+        "td",
+        "img",
+        "blockquote",
+        "del",
+    }
+)
+ALLOWED_ATTRIBUTES = {
+    **bleach.sanitizer.ALLOWED_ATTRIBUTES,
+    "a": ["href", "title", "rel", "target"],
+    "img": ["src", "alt", "title"],
+    "th": ["align", "colspan", "rowspan"],
+    "td": ["align", "colspan", "rowspan"],
+}
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the annotation server."""
+    cli = argparse.ArgumentParser(description="Run the OCR annotation web UI.")
+    modes = ["all", "table-candidates", "sample"]
+    # Data locations.
+    cli.add_argument("--ocr-root", type=Path, default=DEFAULT_OCR_ROOT)
+    cli.add_argument("--raw-root", type=Path, default=DEFAULT_RAW_ROOT)
+    # Session identity.
+    cli.add_argument("--session-id", default=None, help="Resume an existing session.")
+    cli.add_argument("--session-name", default="OCR annotation session")
+    cli.add_argument("--annotator", default="anonymous")
+    # Queue construction.
+    cli.add_argument("--queue-mode", choices=modes, default="table-candidates")
+    cli.add_argument("--sample-size", type=int, default=None)
+    cli.add_argument("--seed", type=int, default=17)
+    cli.add_argument("--limit", type=int, default=None, help="Maximum queued pages.")
+    cli.add_argument(
+        "--limit-reports",
+        type=int,
+        default=None,
+        help="Read only the first N reports before queue selection.",
+    )
+    # Web server binding.
+    cli.add_argument("--host", default="127.0.0.1")
+    cli.add_argument("--port", type=int, default=5050)
+    cli.add_argument("--debug", action="store_true")
+    return cli
+
+
+def prepare_session(args: argparse.Namespace) -> str:
+    """Return the session id to serve: the resumed one, or a freshly built one.
+
+    With ``--session-id`` the stored metadata is loaded and its id returned.
+    Otherwise a queue is built from the CLI options and saved as a new session.
+    """
+    if args.session_id:
+        return load_metadata(args.session_id)["session_id"]
+
+    # Queue-selection options go to build_queue and are recorded in the config.
+    selection = {
+        "queue_mode": args.queue_mode,
+        "sample_size": args.sample_size,
+        "seed": args.seed,
+        "limit": args.limit,
+        "limit_reports": args.limit_reports,
+    }
+    queue, index_summary = build_queue(
+        ocr_root=args.ocr_root,
+        raw_root=args.raw_root,
+        **selection,
+    )
+    roots = {"ocr_root": str(args.ocr_root), "raw_root": str(args.raw_root)}
+    created = create_session(
+        session_name=args.session_name,
+        annotator=args.annotator,
+        manifest_items=[item.to_manifest_record() for item in queue],
+        index_summary=index_summary,
+        config={**roots, **selection},
+    )
+    return created["session_id"]
+
+
+@lru_cache(maxsize=64)
+def cached_pages(mmd_path: str) -> tuple[str, ...]:  # memoised per path string
+    return tuple(load_pages(Path(mmd_path)))
+
+
+def get_item_or_404(session_id: str, index: int) -> dict[str, Any]:
+    manifest = load_manifest(session_id)
+    if index < 0 or index >= len(manifest):  # negative indexes are rejected too
+        abort(404, description="item index out of range")  # raises; never returns
+    return manifest[index]
+
+
+def item_page_text(item: dict[str, Any]) -> str:
+    """Return the Markdown text of the item's page, or ``""`` if the index is invalid."""
+    pages = cached_pages(item["mmd_path"])
+    wanted = int(item.get("page_index", 0))
+    in_range = 0 <= wanted < len(pages)
+    return pages[wanted] if in_range else ""
+
+
+def rewrite_markdown_image_refs(markdown_text: str, session_id: str, index: int) -> str:
+    """Point Markdown ``images/...`` references at the inline-image API route."""
+    route = f"/api/session/{session_id}/item/{index}/inline-image/"
+    return IMAGE_REF_RE.sub(
+        lambda found: f"{found[1]}{route}{found[2].lstrip('./')}{found[3]}",
+        markdown_text,
+    )
+
+
+def rewrite_html_image_refs(html: str, session_id: str, index: int) -> str:
+    """Point ``<img src="images/...">`` attributes at the inline-image API route."""
+    route = f"/api/session/{session_id}/item/{index}/inline-image/"
+    return HTML_IMAGE_SRC_RE.sub(
+        lambda found: f"{found[1]}{route}{found[2].lstrip('./')}{found[3]}",
+        html,
+    )
+
+
+def render_markdown_page(markdown_text: str, session_id: str, index: int) -> str:
+    """Render one OCR Markdown page to bleach-sanitised HTML with API image URLs."""
+    md_extensions = ["tables", "fenced_code", "sane_lists", "nl2br"]
+    source = rewrite_markdown_image_refs(markdown_text, session_id, index)
+    rendered = markdown_lib.markdown(
+        source, extensions=md_extensions, output_format="html5"
+    )
+    # Second pass catches <img src="images/..."> tags present in the rendered HTML.
+    rendered = rewrite_html_image_refs(rendered, session_id, index)
+    # NOTE(review): "data" URIs are allowed here on top of http/https/mailto.
+    schemes = ["http", "https", "mailto", "data"]
+    return bleach.clean(
+        rendered, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, protocols=schemes
+    )
+
+
+def safe_child_path(root: Path, relative_path: str) -> Path:
+    candidate = Path(relative_path)
+    if candidate.is_absolute() or ".." in candidate.parts:  # cheap lexical rejection
+        abort(400, description="unsafe path")
+    resolved_root = root.resolve()
+    target = (resolved_root / candidate).resolve()
+    if not target.is_relative_to(resolved_root):  # re-check after resolving symlinks
+        abort(400, description="unsafe path")
+    return target
+
+
+def progress_payload(session_id: str) -> dict[str, Any]:
+    """Summarise review progress for a session as a JSON-ready dict.
+
+    Status counts default to ``"unreviewed"`` for items lacking a saved status;
+    ``next_unreviewed_index`` is the first item with no annotation, else ``None``.
+    """
+    metadata = load_metadata(session_id)
+    manifest = load_manifest(session_id)
+    current = load_current_annotations(session_id)
+    status_counts: dict[str, int] = {}
+    for entry in manifest:
+        saved = current.get(entry["item_id"], {})
+        label = saved.get("overall_status", "unreviewed")
+        status_counts[label] = status_counts.get(label, 0) + 1
+    pending = (i for i, entry in enumerate(manifest) if entry["item_id"] not in current)
+    return {
+        "metadata": metadata,
+        "item_count": len(manifest),
+        "reviewed_count": len(current),
+        "status_counts": status_counts,
+        "next_unreviewed_index": next(pending, None),
+    }
+
+
+def create_app(default_session_id: str, build_defaults: dict[str, Any]) -> Flask:
+    app = Flask(__name__, template_folder="templates", static_folder="static")
+    app.config["DEFAULT_SESSION_ID"] = default_session_id
+    app.config["BUILD_DEFAULTS"] = build_defaults
+
+    @app.get("/")
+    def index() -> str:
+        return render_template("index.html", default_session_id=default_session_id)
+
+    @app.get("/api/sessions")
+    def api_sessions() -> Any:
+        return jsonify(
+            {"sessions": list_sessions(), "default_session_id": default_session_id}
+        )
+
+    @app.post("/api/sessions")
+    def api_create_session() -> Any:
+        """Create a session from the JSON body, falling back to the CLI defaults.
+
+        Malformed options or an unknown queue mode yield HTTP 400, not a 500.
+        """
+        payload = request.get_json(force=True, silent=True) or {}
+        if not isinstance(payload, dict):
+            abort(400, description="JSON body must be an object")
+        defaults = app.config["BUILD_DEFAULTS"]
+        config: dict[str, Any] = {}
+        for key in ("ocr_root", "raw_root", "queue_mode"):
+            config[key] = payload.get(key) or defaults[key]
+        try:
+            for key in ("sample_size", "seed", "limit", "limit_reports"):
+                value = payload.get(key, defaults.get(key))
+                optional = key != "seed"  # the seed must always be an integer
+                config[key] = None if optional and value is None else int(value)
+            roots = {key: Path(config[key]) for key in ("ocr_root", "raw_root")}
+        except (TypeError, ValueError) as exc:
+            abort(400, description=f"invalid session options: {exc}")
+        try:
+            queue, index_summary = build_queue(**{**config, **roots})
+        except ValueError as exc:  # raised by build_queue for an unknown queue mode
+            abort(400, description=str(exc))
+        metadata = create_session(
+            session_name=str(payload.get("session_name") or "OCR annotation session"),
+            annotator=str(payload.get("annotator") or "anonymous"),
+            manifest_items=[item.to_manifest_record() for item in queue],
+            index_summary=index_summary,
+            config=config,
+        )
+        return jsonify(
+            {"metadata": metadata, "progress": progress_payload(metadata["session_id"])}
+        )
+
+    @app.get("/api/session/<session_id>")
+    def api_session(session_id: str) -> Any:
+        return jsonify(progress_payload(session_id))
+
+    @app.get("/api/session/<session_id>/item/<int:index>")
+    def api_item(session_id: str, index: int) -> Any:
+        item = get_item_or_404(session_id, index)
+        text = item_page_text(item)
+        annotations = load_current_annotations(session_id)
+        return jsonify(
+            {
+                "index": index,
+                "item_count": len(load_manifest(session_id)),
+                "item": item,
+                "annotation": annotations.get(item["item_id"]),
+                "page_text": text,
+                "markdown_html": render_markdown_page(text, session_id, index),
+                "image_url": f"/api/session/{session_id}/item/{index}/raw-image",
+            }
+        )
+
+    @app.get("/api/session/<session_id>/item/<int:index>/raw-image")
+    def api_raw_image(session_id: str, index: int) -> Any:
+        item = get_item_or_404(session_id, index)
+        raw_png_path = item.get("raw_png_path")
+        if not raw_png_path:
+            abort(404, description="raw page image missing")
+        target = Path(raw_png_path).resolve()
+        raw_root = Path(item.get("raw_root") or "/").resolve()
+        if not target.is_relative_to(raw_root):
+            abort(400, description="raw image outside raw root")
+        if not target.is_file():
+            abort(404, description="raw page image missing")
+        return send_file(target)
+
+    @app.get("/api/session/<session_id>/item/<int:index>/inline-image/<path:rel_path>")
+    def api_inline_image(session_id: str, index: int, rel_path: str) -> Any:
+        item = get_item_or_404(session_id, index)
+        report_dir = Path(item["report_dir"])
+        target = safe_child_path(report_dir, rel_path)
+        if not target.is_file():
+            abort(404, description="inline OCR image missing")
+        return send_file(target)
+
+    @app.post("/api/session/<session_id>/annotation")
+    def api_save_annotation(session_id: str) -> Any:
+        payload = request.get_json(force=True, silent=False) or {}
+        item_id = payload.get("item_id")
+        if not item_id:
+            abort(400, description="missing item_id")
+        record = save_annotation(
+            session_id=session_id, item_id=str(item_id), payload=payload
+        )
+        return jsonify({"annotation": record, "progress": progress_payload(session_id)})
+
+    @app.get("/api/session/<session_id>/progress")
+    def api_progress(session_id: str) -> Any:
+        return jsonify(progress_payload(session_id))
+
+    @app.post("/api/session/<session_id>/summarize")
+    def api_summarize(session_id: str) -> Any:
+        paths = write_summary_files(session_id)
+        return jsonify({"paths": paths, "progress": progress_payload(session_id)})
+
+    @app.get("/api/session/<session_id>/summary.csv")
+    def api_summary_csv(session_id: str) -> Any:
+        write_summary_files(session_id)
+        return send_file(session_dir(session_id) / "summary.csv", as_attachment=True)
+
+    @app.get("/api/session/<session_id>/summary.md")
+    def api_summary_md(session_id: str) -> Any:
+        write_summary_files(session_id)
+        return send_file(session_dir(session_id) / "summary.md", as_attachment=True)
+
+    return app
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse CLI options, prepare the session, and serve the annotation UI.
+
+    The queue-building options are also handed to the app as defaults for
+    sessions created later through the API. Returns 0 once the server stops.
+    """
+    args = build_arg_parser().parse_args(argv)
+    session_id = prepare_session(args)
+    option_names = ("queue_mode", "sample_size", "seed", "limit", "limit_reports")
+    build_defaults = {"ocr_root": str(args.ocr_root), "raw_root": str(args.raw_root)}
+    build_defaults.update({name: getattr(args, name) for name in option_names})
+    app = create_app(session_id, build_defaults)
+    print(f"Annotation session: {session_id}")
+    print(f"Open: http://{args.host}:{args.port}")
+    # Blocks until the server is shut down.
+    app.run(host=args.host, port=args.port, debug=args.debug)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/annotation_OCR/static/app.js b/annotation_OCR/static/app.js
new file mode 100644
index 0000000..fd49ccc
--- /dev/null
+++ b/annotation_OCR/static/app.js
@@ -0,0 +1,352 @@
+// Mutable UI state shared by the functions in this script.
+const state = {
+    // Session to work on; presumably injected by the page template — confirm.
+    sessionId: window.OCR_ANNOTATION_DEFAULT_SESSION_ID,
+    // Index of the item currently shown (written by loadItem).
+    index: 0,
+    // Number of items in the session (written by updateProgress and loadItem).
+    itemCount: 0,
+    // Item object from the last loadItem response, or null before the first load.
+    item: null,
+    // Overall decision currently selected in the form (written by setOverall).
+    overallStatus: 'unreviewed',
+    // Date at which the current item was loaded; used to compute review duration.
+    startedAt: null,
+    // Image scale factor; setZoom clamps it to the range [0.35, 3].
+    zoom: 1,
+    // True while the raw Markdown view is shown instead of the rendered preview.
+    showingRaw: false,
+    // True while saveAnnotation is in flight; blocks re-entrant saves.
+    saving: false,
+};
+
+// DOM elements used by this script, looked up by id once when the script is evaluated.
+// A key is null if the page has no element with the corresponding id.
+const els = {
+    sessionTitle: document.getElementById('sessionTitle'),
+    sessionMeta: document.getElementById('sessionMeta'),
+    progressText: document.getElementById('progressText'),
+    progressBar: document.getElementById('progressBar'),
+    prevButton: document.getElementById('prevButton'),
+    nextButton: document.getElementById('nextButton'),
+    skipReviewedButton: document.getElementById('skipReviewedButton'),
+    helpButton: document.getElementById('helpButton'),
+    rawImage: document.getElementById('rawImage'),
+    imageMissing: document.getElementById('imageMissing'),
+    imageSubtitle: document.getElementById('imageSubtitle'),
+    markdownSubtitle: document.getElementById('markdownSubtitle'),
+    markdownPreview: document.getElementById('markdownPreview'),
+    rawMarkdown: document.getElementById('rawMarkdown'),
+    toggleRawButton: document.getElementById('toggleRawButton'),
+    zoomOutButton: document.getElementById('zoomOutButton'),
+    zoomResetButton: document.getElementById('zoomResetButton'),
+    zoomInButton: document.getElementById('zoomInButton'),
+    reportName: document.getElementById('reportName'),
+    industryValue: document.getElementById('industryValue'),
+    tickerValue: document.getElementById('tickerValue'),
+    pageValue: document.getElementById('pageValue'),
+    signalsValue: document.getElementById('signalsValue'),
+    mappingValue: document.getElementById('mappingValue'),
+    notesInput: document.getElementById('notesInput'),
+    issueGrid: document.getElementById('issueGrid'),
+    saveButton: document.getElementById('saveButton'),
+    saveStatus: document.getElementById('saveStatus'),
+    summaryCsvLink: document.getElementById('summaryCsvLink'),
+    summaryMdLink: document.getElementById('summaryMdLink'),
+    helpDialog: document.getElementById('helpDialog'),
+};
+
+/**
+ * Fetch `url` with a JSON content type and return the parsed JSON body.
+ * Rejects with an Error carrying the response text (or "status statusText"
+ * when the body is empty) if the response is not ok.
+ */
+async function apiJson(url, options = {}) {
+    const request = {
+        headers: { 'Content-Type': 'application/json' },
+        ...options,
+    };
+    const response = await fetch(url, request);
+    if (response.ok) {
+        return response.json();
+    }
+    const body = await response.text();
+    throw new Error(body || `${response.status} ${response.statusText}`);
+}
+
+/** Show `message` in the save-status element and tag it with `tone`. */
+function statusMessage(message, tone = 'neutral') {
+    const target = els.saveStatus;
+    target.textContent = message;
+    target.dataset.tone = tone;
+}
+
+/** Join `values` with ", "; a missing or empty list renders as "none". */
+function formatList(values) {
+    const isEmpty = !values || values.length === 0;
+    return isEmpty ? 'none' : values.join(', ');
+}
+
+/**
+ * Render a progress payload into the header, the progress bar and the
+ * summary download links, and refresh state.itemCount from it.
+ */
+function updateProgress(progress) {
+    const meta = progress.metadata || {};
+    const total = progress.item_count || 0;
+    const reviewed = progress.reviewed_count || 0;
+    const percent = total ? Math.round((reviewed / total) * 100) : 0;
+    const exportBase = `/api/session/${state.sessionId}`;
+
+    state.itemCount = total;
+    els.sessionTitle.textContent = meta.session_name || meta.session_id || 'Session';
+    els.sessionMeta.textContent = `${meta.annotator || 'anonymous'} · ${meta.session_id || state.sessionId}`;
+    els.progressText.textContent = `${reviewed} / ${total} reviewed`;
+    els.progressBar.style.width = `${percent}%`;
+    els.summaryCsvLink.href = `${exportBase}/summary.csv`;
+    els.summaryMdLink.href = `${exportBase}/summary.md`;
+}
+
+/** Record the overall decision and highlight the matching status button. */
+function setOverall(status) {
+    state.overallStatus = status;
+    for (const button of document.querySelectorAll('.status-button')) {
+        const isActive = button.dataset.status === status;
+        button.classList.toggle('active', isActive);
+    }
+}
+
+/** Set every subcheck select from `values`; missing keys fall back to "unreviewed". */
+function setSubchecks(values = {}) {
+    for (const select of document.querySelectorAll('[data-subcheck]')) {
+        const stored = values[select.dataset.subcheck];
+        select.value = stored || 'unreviewed';
+    }
+}
+
+/** Tick exactly the issue checkboxes whose value appears in `values`. */
+function setIssues(values = []) {
+    const wanted = new Set(values);
+    for (const box of els.issueGrid.querySelectorAll('input[type="checkbox"]')) {
+        box.checked = wanted.has(box.value);
+    }
+}
+
+/** Collect the current value of every subcheck select, keyed by its data-subcheck name. */
+function getSubchecks() {
+    const selects = document.querySelectorAll('[data-subcheck]');
+    return Object.fromEntries(
+        Array.from(selects, (select) => [select.dataset.subcheck, select.value]),
+    );
+}
+
+/** Return the values of all ticked issue checkboxes, sorted. */
+function getIssues() {
+    const ticked = els.issueGrid.querySelectorAll('input[type="checkbox"]:checked');
+    const tags = [...ticked].map((box) => box.value);
+    tags.sort();
+    return tags;
+}
+
+/** Fill the form from a stored annotation; a missing annotation resets it to defaults. */
+function loadAnnotation(annotation) {
+    const record = annotation ?? {};
+    setOverall(record.overall_status || 'unreviewed');
+    setSubchecks(record.subchecks || {});
+    setIssues(record.issue_tags || []);
+    els.notesInput.value = record.notes || '';
+}
+
+/** Apply state.zoom to the page image and show the percentage on the reset button. */
+function applyZoom() {
+    const { zoom } = state;
+    const image = els.rawImage;
+    // transform: scale() does not grow the layout box, so extra bottom margin is added when zoomed in.
+    const extraMargin = Math.max(0, (zoom - 1) * 100);
+    image.style.transform = `scale(${zoom})`;
+    image.style.marginBottom = `${extraMargin}%`;
+    els.zoomResetButton.textContent = `${Math.round(zoom * 100)}%`;
+}
+
+/** Clamp `value` to [0.35, 3], store it as the zoom factor and re-render. */
+function setZoom(value) {
+    const atLeastMin = Math.max(0.35, value);
+    state.zoom = Math.min(3, atLeastMin);
+    applyZoom();
+}
+
+/** Fetch the session progress payload, render it, and return it. */
+async function loadProgress() {
+    const endpoint = `/api/session/${state.sessionId}`;
+    const payload = await apiJson(endpoint);
+    updateProgress(payload);
+    return payload;
+}
+
+/**
+ * Load the item at `index` (clamped to the valid range) and render it:
+ * metadata, rendered and raw Markdown, page image, stored annotation,
+ * zoom reset and prev/next button state.
+ *
+ * If another loadItem call starts while this one is waiting for the server,
+ * this call returns without touching the page so a slow, older response
+ * cannot overwrite a newer one.
+ */
+async function loadItem(index) {
+    const safeIndex = Math.max(0, Math.min(index, Math.max(0, state.itemCount - 1)));
+
+    // Tag the request; only the most recently started load may render.
+    const requestId = (state.loadRequestId || 0) + 1;
+    state.loadRequestId = requestId;
+    const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}`);
+    if (requestId !== state.loadRequestId) return;
+
+    const item = data.item;
+    // Tolerate items that lack these optional fields instead of throwing mid-render.
+    const sha = item.page_text_sha256 || '';
+    const warnings = item.mapping_warnings || [];
+
+    state.index = safeIndex;
+    state.item = item;
+    state.itemCount = data.item_count;
+    state.startedAt = new Date();
+
+    els.reportName.textContent = item.report_name;
+    els.industryValue.textContent = item.industry_slug;
+    els.tickerValue.textContent = `${item.exchange}:${item.ticker} · ${item.year}`;
+    els.pageValue.textContent = `${item.page_number} / ${item.mmd_page_count}`;
+    els.signalsValue.textContent = formatList(item.candidate_reasons);
+    els.mappingValue.textContent = [item.mapping_status, ...warnings].filter(Boolean).join(' · ');
+    els.imageSubtitle.textContent = item.raw_png_path || 'No raw image path';
+    els.markdownSubtitle.textContent = `${item.page_text_chars} chars · ${sha.slice(0, 12)}`;
+
+    // NOTE(review): inserted as HTML; assumes the server sanitizes markdown_html — confirm.
+    els.markdownPreview.innerHTML = data.markdown_html || '';
+    els.rawMarkdown.textContent = data.page_text || '';
+
+    if (item.raw_png_path) {
+        els.rawImage.hidden = false;
+        els.imageMissing.hidden = true;
+        // The content hash is appended as a cache-busting query parameter.
+        els.rawImage.src = `${data.image_url}?v=${encodeURIComponent(sha)}`;
+    } else {
+        els.rawImage.hidden = true;
+        els.imageMissing.hidden = false;
+        els.rawImage.removeAttribute('src');
+    }
+
+    loadAnnotation(data.annotation);
+    setZoom(1);
+    statusMessage(`Loaded item ${safeIndex + 1} of ${data.item_count}`);
+    els.prevButton.disabled = safeIndex === 0;
+    els.nextButton.disabled = safeIndex >= data.item_count - 1;
+}
+
+/**
+ * Build the request body for saving the current form state.
+ * Duration and start timestamp are null when no item start time is recorded.
+ */
+function annotationPayload(source = 'manual') {
+    const started = state.startedAt;
+    const payload = {
+        item_id: state.item.item_id,
+        overall_status: state.overallStatus,
+        subchecks: getSubchecks(),
+        issue_tags: getIssues(),
+        notes: els.notesInput.value,
+        annotation_source: source,
+    };
+    payload.review_duration_ms = started ? new Date() - started : null;
+    payload.client_started_at_utc = started ? started.toISOString() : null;
+    payload.client_updated_at_utc = new Date().toISOString();
+    return payload;
+}
+
+/**
+ * Save the current form as an annotation for the loaded item.
+ *
+ * Does nothing when no item is loaded or a save is already in flight.
+ * With `advance` set, loads the next item (if any) after a successful save.
+ * Errors are reported through statusMessage and never rethrown.
+ */
+async function saveAnnotation(source = 'manual', advance = false) {
+    if (!state.item || state.saving) return;
+    state.saving = true;
+    els.saveButton.disabled = true;
+    statusMessage('Saving...');
+    // Tracks whether the POST succeeded, so a later failure is not reported as a failed save.
+    let saved = false;
+    try {
+        const data = await apiJson(`/api/session/${state.sessionId}/annotation`, {
+            method: 'POST',
+            body: JSON.stringify(annotationPayload(source)),
+        });
+        saved = true;
+        updateProgress(data.progress);
+        statusMessage('Saved', 'ok');
+        if (advance && state.index < state.itemCount - 1) {
+            await loadItem(state.index + 1);
+            await loadProgress();
+        }
+    } catch (error) {
+        const prefix = saved ? 'Saved, but refreshing the view failed' : 'Save failed';
+        statusMessage(`${prefix}: ${error.message}`, 'error');
+    } finally {
+        state.saving = false;
+        els.saveButton.disabled = false;
+    }
+}
+
+/**
+ * Apply a one-key decision: set the overall status, preset the subchecks,
+ * then save and advance to the next item.
+ *
+ * - 'ok' marks every subcheck ok and clears all issue tags.
+ * - 'not_ok' presets the subchecks only if none has been reviewed yet.
+ * - 'uncertain' marks every subcheck uncertain.
+ */
+function quickMark(status) {
+    // saveAnnotation would skip in these cases; bail out first so the form
+    // is not changed without the change being saved.
+    if (!state.item || state.saving) return;
+
+    setOverall(status);
+    if (status === 'ok') {
+        setSubchecks({
+            text_content: 'ok',
+            table_content: 'ok',
+            table_structure: 'ok',
+            page_alignment: 'ok',
+        });
+        setIssues([]);
+    } else if (status === 'not_ok') {
+        const subchecks = getSubchecks();
+        // Keep any subcheck the annotator already set by hand.
+        if (Object.values(subchecks).every((value) => value === 'unreviewed')) {
+            setSubchecks({
+                text_content: 'uncertain',
+                table_content: 'uncertain',
+                table_structure: 'not_ok',
+                page_alignment: 'uncertain',
+            });
+        }
+    } else if (status === 'uncertain') {
+        setSubchecks({
+            text_content: 'uncertain',
+            table_content: 'uncertain',
+            table_structure: 'uncertain',
+            page_alignment: 'uncertain',
+        });
+    }
+    saveAnnotation(`shortcut:${status}`, true);
+}
+
+/** Flip the issue checkbox whose value equals `tag`; unknown tags are ignored. */
+function toggleIssue(tag) {
+    const box = els.issueGrid.querySelector(`input[value="${tag}"]`);
+    if (!box) return;
+    box.checked = !box.checked;
+}
+
+/**
+ * Move `delta` items from the current one and refresh progress.
+ * Out-of-range targets are ignored; load errors are shown in the status line
+ * instead of surfacing as unhandled promise rejections.
+ */
+async function go(delta) {
+    const target = state.index + delta;
+    if (target < 0 || target >= state.itemCount) return;
+    try {
+        await loadItem(target);
+        await loadProgress();
+    } catch (error) {
+        statusMessage(`Navigation failed: ${error.message}`, 'error');
+    }
+}
+
+/**
+ * Jump to the next unreviewed item reported by the server.
+ * Shows "No open items" when there is none; load errors are shown in the
+ * status line instead of surfacing as unhandled promise rejections.
+ */
+async function goNextOpen() {
+    try {
+        const progress = await loadProgress();
+        // `== null` covers both null and undefined.
+        if (progress.next_unreviewed_index == null) {
+            statusMessage('No open items');
+            return;
+        }
+        await loadItem(progress.next_unreviewed_index);
+    } catch (error) {
+        statusMessage(`Navigation failed: ${error.message}`, 'error');
+    }
+}
+
+/** Switch between the rendered preview and the raw Markdown view. */
+function toggleRawMarkdown() {
+    const showRaw = !state.showingRaw;
+    state.showingRaw = showRaw;
+    els.rawMarkdown.hidden = !showRaw;
+    els.markdownPreview.hidden = showRaw;
+    // The button is labelled with the view it switches to.
+    els.toggleRawButton.textContent = showRaw ? 'Rendered' : 'Raw Markdown';
+}
+
+/** Truthy when the focused element is a textarea, input or select. */
+function inputHasFocus() {
+    const focused = document.activeElement;
+    const formTags = ['TEXTAREA', 'INPUT', 'SELECT'];
+    return focused && formTags.includes(focused.tagName);
+}
+
+/**
+ * Wire up button handlers and the global keyboard shortcuts.
+ *
+ * Shortcuts are ignored while a form control has focus, while the help
+ * dialog is open, and when Ctrl, Meta or Alt is held, so browser shortcuts
+ * such as Ctrl+R or Ctrl+C keep working and never trigger an annotation.
+ */
+function setupEvents() {
+    els.prevButton.addEventListener('click', () => go(-1));
+    els.nextButton.addEventListener('click', () => go(1));
+    els.skipReviewedButton.addEventListener('click', goNextOpen);
+    els.saveButton.addEventListener('click', () => saveAnnotation('manual', false));
+    els.toggleRawButton.addEventListener('click', toggleRawMarkdown);
+    els.zoomOutButton.addEventListener('click', () => setZoom(state.zoom - 0.15));
+    els.zoomInButton.addEventListener('click', () => setZoom(state.zoom + 0.15));
+    els.zoomResetButton.addEventListener('click', () => setZoom(1));
+    els.helpButton.addEventListener('click', () => els.helpDialog.showModal());
+    document.querySelectorAll('.status-button').forEach((button) => {
+        button.addEventListener('click', () => setOverall(button.dataset.status));
+    });
+
+    // Keys are matched against the lower-cased event.key.
+    const shortcuts = new Map([
+        ['?', () => els.helpDialog.showModal()],
+        ['a', () => quickMark('ok')],
+        ['r', () => quickMark('not_ok')],
+        ['u', () => quickMark('uncertain')],
+        ['arrowright', () => go(1)],
+        ['j', () => go(1)],
+        ['arrowleft', () => go(-1)],
+        ['k', () => go(-1)],
+        ['t', () => toggleIssue('broken_table')],
+        ['c', () => toggleIssue('merged_columns')],
+        ['m', () => toggleIssue('missing_text')],
+        ['+', () => setZoom(state.zoom + 0.15)],
+        ['=', () => setZoom(state.zoom + 0.15)],
+        ['-', () => setZoom(state.zoom - 0.15)],
+        ['0', () => setZoom(1)],
+    ]);
+
+    document.addEventListener('keydown', (event) => {
+        if (inputHasFocus()) return;
+        // Leave browser/OS key combinations alone (Shift stays allowed for '?' and '+').
+        if (event.ctrlKey || event.metaKey || event.altKey) return;
+        // Do not annotate or navigate behind the open help dialog.
+        if (els.helpDialog.open) return;
+        const action = shortcuts.get(event.key.toLowerCase());
+        if (!action) return;
+        event.preventDefault();
+        action();
+    });
+}
+
+/**
+ * Entry point: attach event handlers, load progress, then open the first
+ * unreviewed item (or item 0). Startup errors are shown in the status line.
+ */
+async function init() {
+    setupEvents();
+    try {
+        const progress = await loadProgress();
+        const firstIndex = progress.next_unreviewed_index ?? 0;
+        if (!(progress.item_count > 0)) {
+            statusMessage('Session has no queued items', 'error');
+            return;
+        }
+        await loadItem(firstIndex);
+    } catch (error) {
+        statusMessage(`Startup failed: ${error.message}`, 'error');
+    }
+}
+
+init();
\ No newline at end of file
diff --git a/annotation_OCR/static/style.css b/annotation_OCR/static/style.css
new file mode 100644
index 0000000..deaa160
--- /dev/null
+++ b/annotation_OCR/static/style.css
@@ -0,0 +1,448 @@
+/* Custom properties shared by the rules below: surfaces, text, borders,
+   accent and status colours (--green / --red / --amber colour the active
+   status buttons), the panel shadow and the two font stacks. */
+:root {
+    --bg: #edf1f2;
+    --panel: #fbfcfa;
+    --panel-2: #f5f7f4;
+    --ink: #1d2528;
+    --muted: #5b686d;
+    --line: #cdd7d8;
+    --teal: #08746f;
+    --teal-dark: #075854;
+    --red: #aa3d2d;
+    --amber: #a06010;
+    --green: #2d7434;
+    --shadow: 0 18px 45px rgba(31, 45, 49, 0.14);
+    --mono: "JetBrains Mono", "IBM Plex Mono", "Cascadia Mono", monospace;
+    --sans: "Aptos", "Source Sans 3", "Segoe UI", sans-serif;
+}
+
+* {
+    box-sizing: border-box;
+}
+
+body {
+    margin: 0;
+    min-height: 100vh;
+    background:
+        linear-gradient(135deg, rgba(8, 116, 111, 0.09), transparent 34%),
+        linear-gradient(315deg, rgba(170, 61, 45, 0.08), transparent 36%),
+        var(--bg);
+    color: var(--ink);
+    font-family: var(--sans);
+}
+
+button,
+select,
+textarea {
+    font: inherit;
+}
+
+button,
+.secondary-link {
+    border: 1px solid var(--line);
+    background: var(--panel);
+    color: var(--ink);
+    min-height: 36px;
+    padding: 0 12px;
+    border-radius: 6px;
+    cursor: pointer;
+    text-decoration: none;
+    display: inline-flex;
+    align-items: center;
+    justify-content: center;
+    white-space: nowrap;
+}
+
+button:hover,
+.secondary-link:hover {
+    border-color: var(--teal);
+}
+
+.topbar {
+    position: sticky;
+    top: 0;
+    z-index: 20;
+    display: grid;
+    grid-template-columns: minmax(280px, 1fr) minmax(260px, 420px) auto;
+    gap: 18px;
+    align-items: center;
+    padding: 14px 18px;
+    background: rgba(251, 252, 250, 0.94);
+    border-bottom: 1px solid var(--line);
+    backdrop-filter: blur(14px);
+}
+
+.eyebrow,
+.section-label {
+    color: var(--muted);
+    font-size: 11px;
+    font-weight: 700;
+    letter-spacing: 0;
+    text-transform: uppercase;
+}
+
+.session-title {
+    font-size: 18px;
+    font-weight: 800;
+}
+
+.session-meta,
+.pane-subtitle,
+.save-status {
+    color: var(--muted);
+    font-size: 12px;
+}
+
+.progress-block {
+    display: grid;
+    gap: 7px;
+}
+
+.progress-track {
+    width: 100%;
+    height: 8px;
+    overflow: hidden;
+    background: #dce3e4;
+    border-radius: 999px;
+}
+
+.progress-track div {
+    width: 0%;
+    height: 100%;
+    background: linear-gradient(90deg, var(--teal), #6a8d28);
+    transition: width 160ms ease;
+}
+
+.nav-actions,
+.zoom-actions,
+.panel-actions {
+    display: flex;
+    gap: 8px;
+    align-items: center;
+}
+
+.icon-button {
+    width: 36px;
+    padding: 0;
+    font-weight: 800;
+}
+
+/* Three-column grid: two flexible columns and a fixed 340px column. */
+.workspace {
+    display: grid;
+    grid-template-columns: minmax(340px, 1.05fr) minmax(340px, 1fr) 340px;
+    gap: 14px;
+    padding: 14px;
+    /* NOTE(review): 82px presumably matches the rendered .topbar height; it is hard-coded, not derived — confirm. */
+    height: calc(100vh - 82px);
+}
+
+.pane,
+.annotation-panel {
+    min-height: 0;
+    background: var(--panel);
+    border: 1px solid var(--line);
+    border-radius: 8px;
+    box-shadow: var(--shadow);
+}
+
+.pane {
+    display: grid;
+    grid-template-rows: auto minmax(0, 1fr);
+    overflow: hidden;
+}
+
+.pane-toolbar {
+    display: flex;
+    justify-content: space-between;
+    gap: 12px;
+    align-items: center;
+    padding: 12px;
+    border-bottom: 1px solid var(--line);
+    background: var(--panel-2);
+}
+
+.pane-title {
+    font-size: 15px;
+    font-weight: 800;
+}
+
+.image-stage {
+    position: relative;
+    overflow: auto;
+    display: grid;
+    place-items: start center;
+    padding: 16px;
+    background:
+        linear-gradient(45deg, #dce3e4 25%, transparent 25%),
+        linear-gradient(-45deg, #dce3e4 25%, transparent 25%),
+        linear-gradient(45deg, transparent 75%, #dce3e4 75%),
+        linear-gradient(-45deg, transparent 75%, #dce3e4 75%);
+    background-size: 22px 22px;
+    background-position: 0 0, 0 11px, 11px -11px, -11px 0;
+}
+
+/* The page image. app.js (applyZoom) zooms it with transform: scale(); the
+   top-centre transform origin keeps the top edge fixed while scaling. */
+#rawImage {
+    display: block;
+    max-width: none;
+    width: min(100%, 900px);
+    transform-origin: top center;
+    border: 1px solid #b7c3c5;
+    background: white;
+    box-shadow: 0 12px 28px rgba(31, 45, 49, 0.18);
+}
+
+.missing-state {
+    margin: 40px auto;
+    padding: 20px;
+    border: 1px dashed var(--red);
+    background: #fff7f3;
+    color: var(--red);
+    border-radius: 8px;
+}
+
+.markdown-preview,
+.raw-markdown {
+    overflow: auto;
+    margin: 0;
+    padding: 18px;
+}
+
+.markdown-preview {
+    line-height: 1.48;
+}
+
+.markdown-preview h1,
+.markdown-preview h2,
+.markdown-preview h3 {
+    margin: 1.2em 0 0.45em;
+    line-height: 1.15;
+}
+
+.markdown-preview table {
+    width: max-content;
+    max-width: 100%;
+    border-collapse: collapse;
+    margin: 14px 0;
+    font-size: 13px;
+}
+
+.markdown-preview th,
+.markdown-preview td {
+    border: 1px solid #b9c4c6;
+    padding: 6px 8px;
+    vertical-align: top;
+}
+
+.markdown-preview th {
+    background: #e3eceb;
+}
+
+.markdown-preview img {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--line);
+}
+
+.raw-markdown {
+    font-family: var(--mono);
+    font-size: 12px;
+    line-height: 1.45;
+    white-space: pre-wrap;
+    background: #172225;
+    color: #e7eeed;
+}
+
+.annotation-panel {
+    display: flex;
+    flex-direction: column;
+    overflow: auto;
+    padding: 12px;
+    gap: 12px;
+}
+
+.panel-section {
+    display: grid;
+    gap: 10px;
+    padding-bottom: 12px;
+    border-bottom: 1px solid var(--line);
+}
+
+.report-card h1 {
+    margin: 0;
+    font-size: 18px;
+    line-height: 1.2;
+}
+
+dl {
+    display: grid;
+    gap: 7px;
+    margin: 0;
+}
+
+dl div {
+    display: grid;
+    grid-template-columns: 78px minmax(0, 1fr);
+    gap: 8px;
+}
+
+dt {
+    color: var(--muted);
+    font-size: 12px;
+}
+
+dd {
+    margin: 0;
+    min-width: 0;
+    overflow-wrap: anywhere;
+    font-size: 12px;
+}
+
+.decision-buttons {
+    display: grid;
+    grid-template-columns: repeat(3, 1fr);
+    gap: 8px;
+}
+
+.status-button[data-status="ok"].active {
+    background: var(--green);
+    border-color: var(--green);
+    color: white;
+}
+
+.status-button[data-status="not_ok"].active {
+    background: var(--red);
+    border-color: var(--red);
+    color: white;
+}
+
+.status-button[data-status="uncertain"].active {
+    background: var(--amber);
+    border-color: var(--amber);
+    color: white;
+}
+
+.subchecks-section label {
+    display: grid;
+    grid-template-columns: 1fr 140px;
+    gap: 8px;
+    align-items: center;
+    font-size: 13px;
+}
+
+select,
+textarea {
+    width: 100%;
+    border: 1px solid var(--line);
+    border-radius: 6px;
+    background: white;
+    color: var(--ink);
+}
+
+select {
+    min-height: 34px;
+}
+
+textarea {
+    resize: vertical;
+    padding: 8px;
+}
+
+.issue-grid {
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    gap: 8px;
+}
+
+.issue-grid label {
+    display: flex;
+    align-items: center;
+    gap: 6px;
+    font-size: 12px;
+}
+
+.primary-button {
+    background: var(--teal);
+    border-color: var(--teal);
+    color: white;
+    font-weight: 800;
+    flex: 1;
+}
+
+.primary-button:hover {
+    background: var(--teal-dark);
+    border-color: var(--teal-dark);
+}
+
+.save-status {
+    min-height: 20px;
+}
+
+.help-dialog {
+    width: min(520px, calc(100vw - 32px));
+    border: 1px solid var(--line);
+    border-radius: 8px;
+    box-shadow: var(--shadow);
+}
+
+.dialog-header {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 12px;
+}
+
+.dialog-header h2 {
+    margin: 0 0 12px;
+}
+
+.shortcut-grid {
+    display: grid;
+    grid-template-columns: 90px minmax(0, 1fr);
+    gap: 8px 14px;
+}
+
+.shortcut-grid span {
+    font-family: var(--mono);
+    font-weight: 800;
+}
+
+.shortcut-grid p {
+    margin: 0;
+}
+
+@media (max-width: 1180px) {
+    .topbar {
+        grid-template-columns: 1fr;
+    }
+
+    .workspace {
+        height: auto;
+        min-height: calc(100vh - 82px);
+        grid-template-columns: 1fr;
+    }
+
+    .pane {
+        min-height: 72vh;
+    }
+
+    .annotation-panel {
+        min-height: 0;
+    }
+}
+
+@media (max-width: 620px) {
+
+    .nav-actions,
+    .pane-toolbar,
+    .panel-actions {
+        flex-wrap: wrap;
+    }
+
+    .decision-buttons,
+    .issue-grid {
+        grid-template-columns: 1fr;
+    }
+
+    .subchecks-section label,
+    dl div {
+        grid-template-columns: 1fr;
+    }
+}
\ No newline at end of file
diff --git a/annotation_OCR/store.py b/annotation_OCR/store.py
new file mode 100644
index 0000000..e2cbe3a
--- /dev/null
+++ b/annotation_OCR/store.py
@@ -0,0 +1,400 @@
+"""File-backed session storage for OCR annotation runs."""
+
+from __future__ import annotations
+
+import csv
+import json
+import re
+import uuid
+from collections import Counter
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+HERE = Path(__file__).resolve().parent
+SESSIONS_DIR = HERE / "sessions"
+SCHEMA_VERSION = "1.0"
+
+VALID_OVERALL_STATUS = {"ok", "not_ok", "uncertain", "unreviewed"}
+VALID_SUBCHECK_STATUS = {"ok", "not_ok", "uncertain", "not_applicable", "unreviewed"}
+
+SUMMARY_FIELDS = [
+    "session_id",
+    "session_name",
+    "annotator",
+    "item_id",
+    "industry_slug",
+    "report_name",
+    "exchange",
+    "ticker",
+    "year",
+    "page_index",
+    "page_number",
+    "overall_status",
+    "text_content",
+    "table_content",
+    "table_structure",
+    "page_alignment",
+    "issue_tags",
+    "notes",
+    "updated_at_utc",
+    "annotation_source",
+    "review_duration_ms",
+    "mapping_status",
+    "mapping_warnings",
+    "candidate_reasons",
+    "page_text_sha256",
+    "raw_png_path",
+    "mmd_path",
+]
+
+
+def utc_now() -> str:
+    """Return the current UTC time as an ISO-8601 string with second precision."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat(timespec="seconds")
+
+
+def session_slug(value: str) -> str:
+    """Turn ``value`` into a file-name-safe slug of at most 48 characters.
+
+    Runs of characters outside ``[A-Za-z0-9_.-]`` become a single ``-``;
+    leading and trailing ``-``, ``.`` and ``_`` are removed. An empty result
+    falls back to ``"session"``.
+    """
+    collapsed = re.sub(r"[^A-Za-z0-9_.-]+", "-", value.strip())
+    trimmed = collapsed.strip("-._")
+    return trimmed[:48] if trimmed else "session"
+
+
+def new_session_id(session_name: str | None = None) -> str:
+    """Build a fresh session id: a slug prefix (max 24 chars) plus 12 random hex chars."""
+    base = session_name if session_name else "session"
+    prefix = session_slug(base)[:24]
+    token = uuid.uuid4().hex[:12]
+    return f"{prefix}-{token}"
+
+
+def atomic_write_text(path: Path, text: str) -> None:
+    """Write ``text`` to ``path`` as UTF-8 via a scratch file and a rename.
+
+    The parent directory is created if needed. The text is first written to
+    a uniquely named sibling file and then moved over ``path``, so readers
+    never observe a half-written file. The scratch file is removed if the
+    write or the rename fails.
+
+    Raises:
+        OSError: if the directory, the scratch file or the rename fails.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # A unique name per call keeps concurrent writers of the same target from
+    # sharing (and clobbering) one scratch file.
+    tmp = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
+    try:
+        tmp.write_text(text, encoding="utf-8")
+        tmp.replace(path)
+    finally:
+        # After a successful replace the scratch file no longer exists.
+        tmp.unlink(missing_ok=True)
+
+
+def atomic_write_json(path: Path, payload: Any) -> None:
+    """Serialize ``payload`` as indented, non-ASCII-preserving JSON and write it atomically."""
+    document = json.dumps(payload, indent=2, ensure_ascii=False)
+    atomic_write_text(path, document)
+
+
+def session_dir(session_id: str) -> Path:
+    """Return the directory that holds the files of ``session_id``.
+
+    Session ids reach this function from URL path segments, so ids that are
+    not a single plain path component are rejected to keep every session
+    path inside ``SESSIONS_DIR``.
+
+    Raises:
+        FileNotFoundError: if ``session_id`` is empty, ``"."``, ``".."`` or
+            contains a path separator or drive prefix.
+    """
+    # Path(...).name differs from the input whenever it has more than one component.
+    if session_id in {"", ".", ".."} or Path(session_id).name != session_id:
+        raise FileNotFoundError(f"invalid session id: {session_id!r}")
+    return SESSIONS_DIR / session_id
+
+
+def metadata_path(session_id: str) -> Path:
+    """Return the path of the session's ``metadata.json``."""
+    return session_dir(session_id).joinpath("metadata.json")
+
+
+def manifest_path(session_id: str) -> Path:
+    """Return the path of the session's ``manifest.json``."""
+    return session_dir(session_id).joinpath("manifest.json")
+
+
+def current_annotations_path(session_id: str) -> Path:
+    """Return the path of the session's ``current_annotations.json``."""
+    return session_dir(session_id).joinpath("current_annotations.json")
+
+
+def annotations_log_path(session_id: str) -> Path:
+    """Return the path of the session's append-only ``annotations.jsonl`` log."""
+    return session_dir(session_id).joinpath("annotations.jsonl")
+
+
+def create_session(
+    *,
+    session_name: str,
+    annotator: str,
+    manifest_items: list[dict[str, Any]],
+    index_summary: dict[str, Any],
+    config: dict[str, Any],
+    session_id: str | None = None,
+) -> dict[str, Any]:
+    """Create a new session directory and write its initial files.
+
+    Writes ``metadata.json``, ``manifest.json``, an empty
+    ``current_annotations.json`` and an empty annotation log, then generates
+    the summary files.
+
+    Args:
+        session_name: Human-readable name stored in the metadata.
+        annotator: Annotator name stored in the metadata.
+        manifest_items: Items to queue; stored verbatim in the manifest.
+        index_summary: Stored verbatim in the metadata.
+        config: Stored verbatim in the metadata.
+        session_id: Id to use; a new one is generated when falsy.
+
+    Returns:
+        The metadata dictionary that was written.
+
+    Raises:
+        FileExistsError: if a directory for the session id already exists.
+    """
+    sid = session_id if session_id else new_session_id(session_name)
+    target_dir = session_dir(sid)
+    if target_dir.exists():
+        raise FileExistsError(f"session already exists: {sid}")
+    target_dir.mkdir(parents=True, exist_ok=False)
+
+    created = utc_now()
+    item_total = len(manifest_items)
+
+    metadata = {
+        "schema_version": SCHEMA_VERSION,
+        "session_id": sid,
+        "session_name": session_name,
+        "annotator": annotator,
+        "created_at_utc": created,
+        "updated_at_utc": created,
+        "status": "active",
+        "item_count": item_total,
+        "completed_count": 0,
+        "index_summary": index_summary,
+        "config": config,
+    }
+    manifest = {
+        "schema_version": SCHEMA_VERSION,
+        "session_id": sid,
+        "created_at_utc": created,
+        "item_count": item_total,
+        "items": manifest_items,
+    }
+
+    initial_files = (
+        (metadata_path(sid), metadata),
+        (manifest_path(sid), manifest),
+        (current_annotations_path(sid), {}),
+    )
+    for destination, content in initial_files:
+        atomic_write_json(destination, content)
+    annotations_log_path(sid).touch()
+    write_summary_files(sid)
+    return metadata
+
+
+def load_json(path: Path, default: Any | None = None) -> Any:
+    """Parse the UTF-8 JSON file at ``path``; return ``default`` if it is not a file."""
+    if path.is_file():
+        raw = path.read_text(encoding="utf-8")
+        return json.loads(raw)
+    return default
+
+
+def load_metadata(session_id: str) -> dict[str, Any]:
+    """Load the session's metadata.
+
+    Raises:
+        FileNotFoundError: if the metadata file is missing (or holds JSON ``null``).
+    """
+    loaded = load_json(metadata_path(session_id))
+    if loaded is not None:
+        return loaded
+    raise FileNotFoundError(f"unknown session: {session_id}")
+
+
+def load_manifest(session_id: str) -> list[dict[str, Any]]:
+    """Return the ``items`` list of the session's manifest (empty if the key is absent).
+
+    Raises:
+        FileNotFoundError: if the manifest file is missing (or holds JSON ``null``).
+    """
+    loaded = load_json(manifest_path(session_id))
+    if loaded is not None:
+        return loaded.get("items", [])
+    raise FileNotFoundError(f"unknown session manifest: {session_id}")
+
+
+def load_current_annotations(session_id: str) -> dict[str, dict[str, Any]]:
+    """Return the latest annotation per item id; an absent or empty file yields ``{}``."""
+    stored = load_json(current_annotations_path(session_id), default={})
+    return stored if stored else {}
+
+
+def list_sessions() -> list[dict[str, Any]]:
+    """Return the metadata of every session directory, most recently updated first.
+
+    Directories without a ``metadata.json`` holding a JSON object are skipped.
+    Returns an empty list when the sessions directory does not exist.
+    """
+    if not SESSIONS_DIR.is_dir():
+        return []
+    loaded = (
+        load_json(entry / "metadata.json")
+        for entry in sorted(SESSIONS_DIR.iterdir())
+        if entry.is_dir()
+    )
+    found = [metadata for metadata in loaded if isinstance(metadata, dict)]
+    return sorted(found, key=lambda rec: rec.get("updated_at_utc", ""), reverse=True)
+
+
+def manifest_index(session_id: str) -> dict[str, dict[str, Any]]:
+    """Return the session's manifest items keyed by ``item_id``."""
+    by_id: dict[str, dict[str, Any]] = {}
+    for entry in load_manifest(session_id):
+        by_id[entry["item_id"]] = entry
+    return by_id
+
+
+def sanitize_status(value: Any, valid: set[str], default: str) -> str:
+    """Return ``value`` if it is a string contained in ``valid``, otherwise ``default``."""
+    accepted = isinstance(value, str) and value in valid
+    return value if accepted else default
+
+
+def normalize_annotation_payload(payload: dict[str, Any]) -> dict[str, Any]:
+    """Reduce a client payload to the annotation fields that get stored.
+
+    Statuses outside the allowed sets become ``"unreviewed"``; issue tags are
+    stringified, stripped, de-duplicated and sorted; notes are stripped. The
+    duration and client timestamps are copied through unchanged.
+    """
+    raw_subchecks = payload.get("subchecks")
+    if not isinstance(raw_subchecks, dict):
+        raw_subchecks = {}
+    subcheck_names = ("text_content", "table_content", "table_structure", "page_alignment")
+    subchecks = {
+        name: sanitize_status(raw_subchecks.get(name), VALID_SUBCHECK_STATUS, "unreviewed")
+        for name in subcheck_names
+    }
+
+    raw_tags = payload.get("issue_tags")
+    unique_tags: set[str] = set()
+    if isinstance(raw_tags, list):
+        for tag in raw_tags:
+            text = str(tag).strip()
+            if text:
+                unique_tags.add(text)
+
+    overall = sanitize_status(
+        payload.get("overall_status"), VALID_OVERALL_STATUS, "unreviewed"
+    )
+    return {
+        "overall_status": overall,
+        "subchecks": subchecks,
+        "issue_tags": sorted(unique_tags),
+        "notes": str(payload.get("notes") or "").strip(),
+        "annotation_source": str(payload.get("annotation_source") or "manual"),
+        "review_duration_ms": payload.get("review_duration_ms"),
+        "client_started_at_utc": payload.get("client_started_at_utc"),
+        "client_updated_at_utc": payload.get("client_updated_at_utc"),
+    }
+
+
+def next_log_sequence(path: Path) -> int:
+    """Return the sequence number for the next log record.
+
+    That is one more than the number of non-blank lines in ``path``, or ``1``
+    when the file does not exist.
+    """
+    if not path.is_file():
+        return 1
+    written = 0
+    with path.open(encoding="utf-8") as handle:
+        for line in handle:
+            if line.strip():
+                written += 1
+    return written + 1
+
+
+def save_annotation(
+    *,
+    session_id: str,
+    item_id: str,
+    payload: dict[str, Any],
+) -> dict[str, Any]:
+    """Persist one annotation for ``item_id`` and refresh the session files.
+
+    The record is appended to the JSONL log, stored as the item's entry in
+    ``current_annotations.json``, and the metadata counters and summary files
+    are rewritten.
+
+    Args:
+        session_id: Session the annotation belongs to.
+        item_id: Id of an item in the session manifest.
+        payload: Client payload; normalized before it is stored.
+
+    Returns:
+        The stored record (manifest fields merged with the normalized payload).
+
+    Raises:
+        FileNotFoundError: propagated from loading the metadata or manifest
+            when the session files are missing.
+        KeyError: if ``item_id`` is not in the session manifest.
+    """
+    metadata = load_metadata(session_id)
+    items = manifest_index(session_id)
+    item = items.get(item_id)
+    if item is None:
+        raise KeyError(f"item not in session manifest: {item_id}")
+
+    normalized = normalize_annotation_payload(payload)
+    now = utc_now()
+    log_path = annotations_log_path(session_id)
+    # Item fields are copied into the record so each log line is self-contained.
+    record = {
+        "schema_version": SCHEMA_VERSION,
+        "sequence": next_log_sequence(log_path),
+        "session_id": session_id,
+        "session_name": metadata.get("session_name"),
+        "annotator": metadata.get("annotator"),
+        "created_at_utc": now,
+        "updated_at_utc": now,
+        "item_id": item_id,
+        "industry_slug": item.get("industry_slug"),
+        "report_name": item.get("report_name"),
+        "exchange": item.get("exchange"),
+        "ticker": item.get("ticker"),
+        "year": item.get("year"),
+        "page_index": item.get("page_index"),
+        "page_number": item.get("page_number"),
+        "mmd_path": item.get("mmd_path"),
+        "raw_png_path": item.get("raw_png_path"),
+        "mapping_status": item.get("mapping_status"),
+        "mapping_warnings": item.get("mapping_warnings", []),
+        "candidate_reasons": item.get("candidate_reasons", []),
+        "page_text_sha256": item.get("page_text_sha256"),
+        **normalized,
+    }
+
+    # Append to the history log first; the per-item map below keeps only the latest record.
+    with log_path.open("a", encoding="utf-8") as handle:
+        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+    current = load_current_annotations(session_id)
+    current[item_id] = record
+    atomic_write_json(current_annotations_path(session_id), current)
+
+    # An item counts as completed once its latest status is anything but "unreviewed".
+    completed_count = sum(
+        1 for rec in current.values() if rec.get("overall_status") != "unreviewed"
+    )
+    metadata["updated_at_utc"] = now
+    metadata["completed_count"] = completed_count
+    metadata["item_count"] = len(items)
+    atomic_write_json(metadata_path(session_id), metadata)
+    write_summary_files(session_id)
+    return record
+
+
+def summary_rows(session_id: str) -> list[dict[str, Any]]:
+    """Build one summary row per manifest item, in manifest order.
+
+    Each row combines session metadata, the item's manifest fields and its
+    latest annotation; items without an annotation get ``"unreviewed"``
+    statuses and empty annotation fields. List values are joined with ``;``.
+    """
+    metadata = load_metadata(session_id)
+    annotations = load_current_annotations(session_id)
+    session_name = metadata.get("session_name", "")
+    annotator = metadata.get("annotator", "")
+
+    item_fields = (
+        "item_id",
+        "industry_slug",
+        "report_name",
+        "exchange",
+        "ticker",
+        "year",
+        "page_index",
+        "page_number",
+    )
+    subcheck_fields = ("text_content", "table_content", "table_structure", "page_alignment")
+
+    def build_row(item: dict[str, Any]) -> dict[str, Any]:
+        annotation = annotations.get(item["item_id"], {})
+        subchecks = annotation.get("subchecks", {})
+        row: dict[str, Any] = {
+            "session_id": session_id,
+            "session_name": session_name,
+            "annotator": annotator,
+        }
+        for field in item_fields:
+            row[field] = item.get(field)
+        row["overall_status"] = annotation.get("overall_status", "unreviewed")
+        for field in subcheck_fields:
+            row[field] = subchecks.get(field, "unreviewed")
+        row["issue_tags"] = ";".join(annotation.get("issue_tags", []))
+        row["notes"] = annotation.get("notes", "")
+        row["updated_at_utc"] = annotation.get("updated_at_utc", "")
+        row["annotation_source"] = annotation.get("annotation_source", "")
+        row["review_duration_ms"] = annotation.get("review_duration_ms", "")
+        row["mapping_status"] = item.get("mapping_status")
+        row["mapping_warnings"] = ";".join(item.get("mapping_warnings", []))
+        row["candidate_reasons"] = ";".join(item.get("candidate_reasons", []))
+        row["page_text_sha256"] = item.get("page_text_sha256")
+        row["raw_png_path"] = item.get("raw_png_path")
+        row["mmd_path"] = item.get("mmd_path")
+        return row
+
+    return [build_row(item) for item in load_manifest(session_id)]
+
+
+def write_summary_csv(path: Path, rows: list[dict[str, Any]]) -> None:
+    """Write rows as CSV to a sibling ".tmp" file, then rename it onto path."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    scratch = path.with_suffix(f"{path.suffix}.tmp")
+    with scratch.open("w", newline="", encoding="utf-8") as out:
+        # Keys outside SUMMARY_FIELDS are dropped rather than raising.
+        table = csv.DictWriter(out, SUMMARY_FIELDS, extrasaction="ignore")
+        table.writeheader()
+        table.writerows(rows)
+    scratch.replace(path)
+
+
+def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None:
+    metadata = load_metadata(path.parent.name)
+    status_counts = Counter(row["overall_status"] for row in rows)
+    issue_counts: Counter[str] = Counter()
+    for row in rows:
+        for tag in str(row.get("issue_tags") or "").split(";"):
+            if tag:
+                issue_counts[tag] += 1
+
+    reviewed = len(rows) - status_counts.get("unreviewed", 0)
+    lines = [
+        f"# OCR Annotation Summary: {metadata.get('session_name', path.parent.name)}",
+        "",
+        f"- Session ID: `{path.parent.name}`",  # the parent directory name is used as the id
+        f"- Annotator: `{metadata.get('annotator', '')}`",
+        f"- Items: {len(rows)}",
+        f"- Reviewed: {reviewed}",  # rows whose overall_status is not "unreviewed"
+        f"- Updated: {metadata.get('updated_at_utc', '')}",
+        "",
+        "## Status Counts",
+        "",
+        "| Status | Count |",
+        "| --- | ---: |",
+    ]
+    for status, count in sorted(status_counts.items()):
+        lines.append(f"| {status} | {count} |")
+
+    lines.extend(["", "## Issue Counts", "", "| Issue | Count |", "| --- | ---: |"])
+    if issue_counts:
+        for issue, count in issue_counts.most_common():
+            lines.append(f"| {issue} | {count} |")
+    else:
+        lines.append("| none | 0 |")
+
+    atomic_write_text(path, "\n".join(lines) + "\n")
+
+
+def write_summary_files(session_id: str) -> dict[str, str]:
+    """Rewrite summary.csv and summary.md for a session; return both paths."""
+    table = summary_rows(session_id)
+    base = session_dir(session_id)
+    targets = {"summary_csv": base / "summary.csv", "summary_md": base / "summary.md"}
+    write_summary_csv(targets["summary_csv"], table)
+    write_summary_md(targets["summary_md"], table)
+    return {key: str(target) for key, target in targets.items()}
+
+
+def write_all_sessions_summary(path: Path | None = None) -> Path:
+    """Write one CSV covering every session's rows and return its path."""
+    target = path if path else SESSIONS_DIR / "all_sessions_summary.csv"
+    sessions = list_sessions()
+    combined = [row for meta in sessions for row in summary_rows(meta["session_id"])]
+    write_summary_csv(target, combined)
+    return target
diff --git a/annotation_OCR/summarize.py b/annotation_OCR/summarize.py
new file mode 100644
index 0000000..31d565d
--- /dev/null
+++ b/annotation_OCR/summarize.py
@@ -0,0 +1,58 @@
+"""Regenerate OCR annotation session summaries."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from store import list_sessions, write_all_sessions_summary, write_summary_files
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the summary regeneration script.
+
+    Options: repeatable --session-id, the --all flag, and --combined-output.
+    """
+    all_help = "Regenerate summaries for every session under annotation_OCR/sessions."
+    combined_help = "Optional path for the combined all-sessions CSV."
+    cli = argparse.ArgumentParser(description="Regenerate OCR annotation summaries.")
+    # Each --session-id occurrence is appended to the same list.
+    cli.add_argument("--session-id", action="append", default=[])
+    cli.add_argument("--all", action="store_true", help=all_help)
+    cli.add_argument(
+        "--combined-output", type=Path, default=None, help=combined_help
+    )
+    return cli
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Regenerate the requested session summaries and print a JSON report.
+
+    Returns 0 as the process exit status.
+    """
+    options = build_arg_parser().parse_args(argv)
+    requested = list(options.session_id)
+    if options.all:
+        requested += [entry["session_id"] for entry in list_sessions()]
+
+    # dict.fromkeys drops repeated ids while keeping first-seen order.
+    report = [
+        {"session_id": sid, **write_summary_files(sid)}
+        for sid in dict.fromkeys(requested)
+    ]
+
+    # The combined CSV is rebuilt for --all or when an output path is given.
+    combined_csv = (
+        str(write_all_sessions_summary(options.combined_output))
+        if options.all or options.combined_output
+        else None
+    )
+
+    summary = {"regenerated": report, "combined_summary_csv": combined_csv}
+    print(json.dumps(summary, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/annotation_OCR/templates/index.html b/annotation_OCR/templates/index.html
new file mode 100644
index 0000000..2c21e6c
--- /dev/null
+++ b/annotation_OCR/templates/index.html
@@ -0,0 +1,204 @@
+<!doctype html>
+<html lang="en">
+
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>OCR Annotation</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
+    <script>
+        window.OCR_ANNOTATION_DEFAULT_SESSION_ID = {{ default_session_id | tojson }};
+    </script>
+    <script defer src="{{ url_for('static', filename='app.js') }}"></script>
+</head>
+
+<body>
+    <header class="topbar">
+        <div class="session-block">
+            <div class="eyebrow">OCR annotation</div>
+            <div class="session-title" id="sessionTitle">Loading session</div>
+            <div class="session-meta" id="sessionMeta"></div>
+        </div>
+
+        <div class="progress-block">
+            <div class="progress-text" id="progressText">0 / 0 reviewed</div>
+            <div class="progress-track" aria-hidden="true">
+                <div id="progressBar"></div>
+            </div>
+        </div>
+
+        <nav class="nav-actions" aria-label="Page navigation">
+            <button id="prevButton" type="button">Prev</button>
+            <button id="nextButton" type="button">Next</button>
+            <button id="skipReviewedButton" type="button">Next open</button>
+            <button id="helpButton" type="button" class="icon-button" title="Keyboard shortcuts">?</button>
+        </nav>
+    </header>
+
+    <main class="workspace">
+        <section class="pane image-pane" aria-label="Raw page image">
+            <div class="pane-toolbar">
+                <div>
+                    <div class="pane-title">Raw page</div>
+                    <div class="pane-subtitle" id="imageSubtitle"></div>
+                </div>
+                <div class="zoom-actions">
+                    <button id="zoomOutButton" type="button" title="Zoom out">-</button>
+                    <button id="zoomResetButton" type="button" title="Reset zoom">100%</button>
+                    <button id="zoomInButton" type="button" title="Zoom in">+</button>
+                </div>
+            </div>
+            <div class="image-stage" id="imageStage">
+                <img id="rawImage" alt="Raw OCR source page">
+                <div id="imageMissing" class="missing-state" hidden>Raw image unavailable</div>
+            </div>
+        </section>
+
+        <section class="pane markdown-pane" aria-label="Extracted Markdown page">
+            <div class="pane-toolbar">
+                <div>
+                    <div class="pane-title">Extracted content</div>
+                    <div class="pane-subtitle" id="markdownSubtitle"></div>
+                </div>
+                <button id="toggleRawButton" type="button">Raw Markdown</button>
+            </div>
+            <article id="markdownPreview" class="markdown-preview"></article>
+            <pre id="rawMarkdown" class="raw-markdown" hidden></pre>
+        </section>
+
+        <aside class="annotation-panel" aria-label="Annotation controls">
+            <div class="panel-section report-card">
+                <div class="eyebrow">Current item</div>
+                <h1 id="reportName">Report</h1>
+                <dl>
+                    <div>
+                        <dt>Industry</dt>
+                        <dd id="industryValue"></dd>
+                    </div>
+                    <div>
+                        <dt>Ticker</dt>
+                        <dd id="tickerValue"></dd>
+                    </div>
+                    <div>
+                        <dt>Page</dt>
+                        <dd id="pageValue"></dd>
+                    </div>
+                    <div>
+                        <dt>Signals</dt>
+                        <dd id="signalsValue"></dd>
+                    </div>
+                    <div>
+                        <dt>Mapping</dt>
+                        <dd id="mappingValue"></dd>
+                    </div>
+                </dl>
+            </div>
+
+            <div class="panel-section decision-section">
+                <div class="section-label">Decision</div>
+                <div class="decision-buttons" role="group" aria-label="Overall status">
+                    <button type="button" class="status-button" data-status="ok">OK</button>
+                    <button type="button" class="status-button" data-status="not_ok">Not OK</button>
+                    <button type="button" class="status-button" data-status="uncertain">Uncertain</button>
+                </div>
+            </div>
+
+            <div class="panel-section subchecks-section">
+                <div class="section-label">Subchecks</div>
+                <label>Text content
+                    <select data-subcheck="text_content">
+                        <option value="unreviewed">Unreviewed</option>
+                        <option value="ok">OK</option>
+                        <option value="not_ok">Not OK</option>
+                        <option value="uncertain">Uncertain</option>
+                        <option value="not_applicable">N/A</option>
+                    </select>
+                </label>
+                <label>Table content
+                    <select data-subcheck="table_content">
+                        <option value="unreviewed">Unreviewed</option>
+                        <option value="ok">OK</option>
+                        <option value="not_ok">Not OK</option>
+                        <option value="uncertain">Uncertain</option>
+                        <option value="not_applicable">N/A</option>
+                    </select>
+                </label>
+                <label>Table structure
+                    <select data-subcheck="table_structure">
+                        <option value="unreviewed">Unreviewed</option>
+                        <option value="ok">OK</option>
+                        <option value="not_ok">Not OK</option>
+                        <option value="uncertain">Uncertain</option>
+                        <option value="not_applicable">N/A</option>
+                    </select>
+                </label>
+                <label>Page alignment
+                    <select data-subcheck="page_alignment">
+                        <option value="unreviewed">Unreviewed</option>
+                        <option value="ok">OK</option>
+                        <option value="not_ok">Not OK</option>
+                        <option value="uncertain">Uncertain</option>
+                        <option value="not_applicable">N/A</option>
+                    </select>
+                </label>
+            </div>
+
+            <div class="panel-section issues-section">
+                <div class="section-label">Issues</div>
+                <div class="issue-grid" id="issueGrid">
+                    <label><input type="checkbox" value="missing_text"> Missing text</label>
+                    <label><input type="checkbox" value="extra_text"> Extra text</label>
+                    <label><input type="checkbox" value="wrong_reading_order"> Reading order</label>
+                    <label><input type="checkbox" value="merged_columns"> Merged columns</label>
+                    <label><input type="checkbox" value="shifted_rows"> Shifted rows</label>
+                    <label><input type="checkbox" value="missing_columns"> Missing columns</label>
+                    <label><input type="checkbox" value="broken_table"> Broken table</label>
+                    <label><input type="checkbox" value="wrong_page"> Wrong page</label>
+                    <label><input type="checkbox" value="image_missing"> Image missing</label>
+                    <label><input type="checkbox" value="low_confidence"> Low confidence</label>
+                </div>
+            </div>
+
+            <div class="panel-section notes-section">
+                <label for="notesInput" class="section-label">Notes</label>
+                <textarea id="notesInput" rows="5" spellcheck="true"></textarea>
+            </div>
+
+            <div class="panel-actions">
+                <button id="saveButton" type="button" class="primary-button">Save</button>
+                <a id="summaryCsvLink" class="secondary-link" href="#">CSV</a>
+                <a id="summaryMdLink" class="secondary-link" href="#">Markdown</a>
+            </div>
+            <div id="saveStatus" class="save-status" aria-live="polite"></div>
+        </aside>
+    </main>
+
+    <dialog id="helpDialog" class="help-dialog">
+        <form method="dialog">
+            <div class="dialog-header">
+                <h2>Keyboard</h2>
+                <button type="submit" class="icon-button">x</button>
+            </div>
+            <div class="shortcut-grid">
+                <span>A</span>
+                <p>OK, save, next</p>
+                <span>R</span>
+                <p>Not OK, save, next</p>
+                <span>U</span>
+                <p>Uncertain, save, next</p>
+                <span>J / K</span>
+                <p>Next / previous</p>
+                <span>T</span>
+                <p>Broken table</p>
+                <span>C</span>
+                <p>Merged columns</p>
+                <span>M</span>
+                <p>Missing text</p>
+                <span>+ / - / 0</span>
+                <p>Zoom</p>
+            </div>
+        </form>
+    </dialog>
+</body>
+
+</html>
\ No newline at end of file

From 255ff285061148963e2d4f74d4ed19467823432a Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Fri, 22 May 2026 11:54:35 +0200
Subject: [PATCH 4/8] DOC: explain OCR annotator

---
 annotation_OCR/README.md | 130 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 annotation_OCR/README.md

diff --git a/annotation_OCR/README.md b/annotation_OCR/README.md
new file mode 100644
index 0000000..598409e
--- /dev/null
+++ b/annotation_OCR/README.md
@@ -0,0 +1,130 @@
+# OCR Annotation Interface
+
+Browser interface for comparing raw OCR page images with the corresponding Markdown page extracted by DeepSeekOCR. The app stores page-level annotations under `annotation_OCR/sessions/` so quality labels can later be joined to LLM benchmark outputs.
+
+## Run
+
+From the repository root:
+
+```bash
+uv run python annotation_OCR/server.py \
+  --session-name "table QA smoke" \
+  --annotator "your-name" \
+  --queue-mode table-candidates \
+  --host 127.0.0.1 \
+  --port 5050
+```
+
+For a small smoke run:
+
+```bash
+uv run python annotation_OCR/server.py \
+  --session-name smoke \
+  --annotator test \
+  --queue-mode table-candidates \
+  --limit-reports 2 \
+  --limit 20 \
+  --host 127.0.0.1 \
+  --port 5050
+```
+
+Resume an existing session:
+
+```bash
+uv run python annotation_OCR/server.py --session-id SESSION_ID --host 127.0.0.1 --port 5050
+```
+
+SSH port forwarding from a laptop:
+
+```bash
+ssh -L 5050:127.0.0.1:5050 USER@SERVER
+```
+
+Then open `http://127.0.0.1:5050` locally.
+
+## Data Sources
+
+Defaults:
+
+- OCR Markdown root: `DeepSeekOCR_Ardian_pruned_1k/`
+- Raw image root: `/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs/`
+
+Each queued item maps one `.mmd` page split to the raw PNG with the same zero-based page index; for example, page index `12` maps to `pages/page_0012.png`. The manifest records mapping warnings such as missing raw images or page-count mismatches.
+
+## Queue Modes
+
+- `table-candidates`: default. Keeps pages with table-like signals, dense numeric rows, financial statement headings, or KPI aliases.
+- `all`: queues every page.
+- `sample`: seeded random sample across all discovered pages. Use `--sample-size` and `--seed`.
+
+Indexer smoke check:
+
+```bash
+uv run python annotation_OCR/ocr_index.py \
+  --ocr-root DeepSeekOCR_Ardian_pruned_1k \
+  --raw-root /data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs \
+  --queue-mode table-candidates \
+  --limit-reports 2 \
+  --limit 20 \
+  --check
+```
+
+## Keyboard
+
+- `a`: mark OK, save, advance
+- `r`: mark Not OK, save, advance
+- `u`: mark Uncertain, save, advance
+- `j` / right arrow: next page
+- `k` / left arrow: previous page
+- `t`: toggle broken table
+- `c`: toggle merged columns
+- `m`: toggle missing text
+- `+`, `-`, `0`: zoom controls
+- `?`: shortcut dialog
+
+Shortcuts are disabled while typing in notes or editing form controls.
+
+## Outputs
+
+Each session writes to `annotation_OCR/sessions/{session_id}/`:
+
+- `metadata.json`: session name, annotator, configuration, counts, timestamps.
+- `manifest.json`: queued pages and mapping diagnostics.
+- `annotations.jsonl`: append-only event log, one saved annotation per line.
+- `current_annotations.json`: latest annotation per item, written atomically.
+- `summary.csv`: one row per queued page, including unreviewed pages.
+- `summary.md`: status and issue-count overview.
+
+Regenerate summaries:
+
+```bash
+uv run python annotation_OCR/summarize.py --session-id SESSION_ID
+uv run python annotation_OCR/summarize.py --all
+```
+
+## Annotation Schema
+
+Primary fields:
+
+- `overall_status`: `ok`, `not_ok`, `uncertain`, or `unreviewed`
+- `subchecks`: `text_content`, `table_content`, `table_structure`, `page_alignment`
+- `issue_tags`: `missing_text`, `extra_text`, `wrong_reading_order`, `merged_columns`, `shifted_rows`, `missing_columns`, `broken_table`, `wrong_page`, `image_missing`, `low_confidence`
+- `notes`: free text
+
+Identity fields include `industry_slug`, `report_name`, `exchange`, `ticker`, `year`, `page_index`, `page_number`, `mmd_path`, `raw_png_path`, and `page_text_sha256`.
+
+## Downstream Joins
+
+For page-level filtering, join annotation summaries on:
+
+```text
+exchange, ticker, year, page_index
+```
+
+For report-level benchmark filtering, aggregate page labels to:
+
+```text
+exchange, ticker, year
+```
+
+A conservative report-level rule is to exclude a report when any reviewed table-candidate page is `not_ok`, or when the share of `uncertain` pages exceeds a threshold chosen for the benchmark run.
\ No newline at end of file

From 8eb03e8f0f1fd5940d36a10768d289f491933df2 Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Fri, 22 May 2026 19:37:39 +0200
Subject: [PATCH 5/8] ENH: let the user make the session name on start.

---
 annotation_OCR/README.md              |  30 ++-
 annotation_OCR/server.py              |  98 +++++++---
 annotation_OCR/static/app.js          | 125 +++++-------
 annotation_OCR/static/style.css       |  61 +++++-
 annotation_OCR/store.py               |  49 -----
 annotation_OCR/templates/index.html   |  89 ++-------
 annotation_OCR/templates/landing.html | 263 ++++++++++++++++++++++++++
 7 files changed, 474 insertions(+), 241 deletions(-)
 create mode 100644 annotation_OCR/templates/landing.html

diff --git a/annotation_OCR/README.md b/annotation_OCR/README.md
index 598409e..11e9641 100644
--- a/annotation_OCR/README.md
+++ b/annotation_OCR/README.md
@@ -4,6 +4,21 @@ Browser interface for comparing raw OCR page images with the corresponding Markd
 
 ## Run
 
+### Headless mode (recommended for multi-user)
+
+Start the server with no session arguments — annotators create/resume sessions
+from the browser landing page:
+
+```bash
+uv run python annotation_OCR/server.py --host 0.0.0.0 --port 5050
+```
+
+Then open `http://HOST:5050`. The landing page lets each user enter their name,
+create a new session, or resume an existing one. No CLI or Python knowledge
+needed on the annotator side.
+
+### Pre-created session (single-user / scripted)
+
 From the repository root:
 
 ```bash
@@ -42,6 +57,8 @@ ssh -L 5050:127.0.0.1:5050 USER@SERVER
 
 Then open `http://127.0.0.1:5050` locally.
 
+The extracted-content pane shows inline OCR images by default. Turn off `Inline images` if you want a lighter placeholder-only Markdown preview.
+
 ## Data Sources
 
 Defaults:
@@ -71,14 +88,11 @@ uv run python annotation_OCR/ocr_index.py \
 
 ## Keyboard
 
-- `a`: mark OK, save, advance
-- `r`: mark Not OK, save, advance
+- `a`: mark Yes, save, advance
+- `r`: mark No, save, advance
 - `u`: mark Uncertain, save, advance
 - `j` / right arrow: next page
 - `k` / left arrow: previous page
-- `t`: toggle broken table
-- `c`: toggle merged columns
-- `m`: toggle missing text
 - `+`, `-`, `0`: zoom controls
 - `?`: shortcut dialog
 
@@ -93,7 +107,7 @@ Each session writes to `annotation_OCR/sessions/{session_id}/`:
 - `annotations.jsonl`: append-only event log, one saved annotation per line.
 - `current_annotations.json`: latest annotation per item, written atomically.
 - `summary.csv`: one row per queued page, including unreviewed pages.
-- `summary.md`: status and issue-count overview.
+- `summary.md`: status-count overview.
 
 Regenerate summaries:
 
@@ -107,9 +121,7 @@ uv run python annotation_OCR/summarize.py --all
 Primary fields:
 
 - `overall_status`: `ok`, `not_ok`, `uncertain`, or `unreviewed`
-- `subchecks`: `text_content`, `table_content`, `table_structure`, `page_alignment`
-- `issue_tags`: `missing_text`, `extra_text`, `wrong_reading_order`, `merged_columns`, `shifted_rows`, `missing_columns`, `broken_table`, `wrong_page`, `image_missing`, `low_confidence`
-- `notes`: free text
+- `notes`: optional free text
 
 Identity fields include `industry_slug`, `report_name`, `exchange`, `ticker`, `year`, `page_index`, `page_number`, `mmd_path`, `raw_png_path`, and `page_text_sha256`.
 
diff --git a/annotation_OCR/server.py b/annotation_OCR/server.py
index 6ea3793..16460d5 100644
--- a/annotation_OCR/server.py
+++ b/annotation_OCR/server.py
@@ -10,7 +10,7 @@
 
 import bleach
 import markdown as markdown_lib
-from flask import Flask, abort, jsonify, render_template, request, send_file
+from flask import Flask, abort, jsonify, redirect, render_template, request, send_file
 
 from ocr_index import DEFAULT_OCR_ROOT, DEFAULT_RAW_ROOT, build_queue, load_pages
 from store import (
@@ -27,7 +27,6 @@
 
 HERE = Path(__file__).resolve().parent
 IMAGE_REF_RE = re.compile(r"(!\[[^\]]*\]\()((?:\./)?images/[^)\s]+)(\))")
-HTML_IMAGE_SRC_RE = re.compile(r'(<img\b[^>]*\bsrc=["\'])(images/[^"\']+)(["\'])', re.I)
 
 ALLOWED_TAGS = set(bleach.sanitizer.ALLOWED_TAGS).union(
     {
@@ -130,8 +129,13 @@ def cached_pages(mmd_path: str) -> tuple[str, ...]:
     return tuple(load_pages(Path(mmd_path)))
 
 
+@lru_cache(maxsize=16)  # memoize results for up to 16 distinct session ids
+def cached_manifest(session_id: str) -> tuple[dict[str, Any], ...]:
+    return tuple(load_manifest(session_id))  # tuple built from load_manifest's result
+
+
 def get_item_or_404(session_id: str, index: int) -> dict[str, Any]:
-    manifest = load_manifest(session_id)
+    manifest = cached_manifest(session_id)
     if index < 0 or index >= len(manifest):
         abort(404, description="item index out of range")
     return manifest[index]
@@ -145,6 +149,12 @@ def item_page_text(item: dict[str, Any]) -> str:
     return pages[page_index]
 
 
+def omit_markdown_image_refs(markdown_text: str) -> str:
+    """Replace each Markdown image reference with an `_[image omitted: path]_` note."""
+    placeholder = r"_[image omitted: \2]_"  # \2 is the image path group of IMAGE_REF_RE
+    return IMAGE_REF_RE.sub(placeholder, markdown_text)
+
+
 def rewrite_markdown_image_refs(markdown_text: str, session_id: str, index: int) -> str:
     def replace_md(match: re.Match[str]) -> str:
         rel_path = match.group(2).lstrip("./")
@@ -154,23 +164,22 @@ def replace_md(match: re.Match[str]) -> str:
     return IMAGE_REF_RE.sub(replace_md, markdown_text)
 
 
-def rewrite_html_image_refs(html: str, session_id: str, index: int) -> str:
-    def replace_html(match: re.Match[str]) -> str:
-        rel_path = match.group(2).lstrip("./")
-        src = f"/api/session/{session_id}/item/{index}/inline-image/{rel_path}"
-        return f"{match.group(1)}{src}{match.group(3)}"
-
-    return HTML_IMAGE_SRC_RE.sub(replace_html, html)
-
-
-def render_markdown_page(markdown_text: str, session_id: str, index: int) -> str:
-    rewritten = rewrite_markdown_image_refs(markdown_text, session_id, index)
+def render_markdown_page(
+    markdown_text: str,
+    *,
+    session_id: str,
+    index: int,
+    show_inline_images: bool,
+) -> str:
+    if show_inline_images:
+        rewritten = rewrite_markdown_image_refs(markdown_text, session_id, index)
+    else:
+        rewritten = omit_markdown_image_refs(markdown_text)
     html = markdown_lib.markdown(
         rewritten,
         extensions=["tables", "fenced_code", "sane_lists", "nl2br"],
         output_format="html5",
     )
-    html = rewrite_html_image_refs(html, session_id, index)
     return bleach.clean(
         html,
         tags=ALLOWED_TAGS,
@@ -192,7 +201,7 @@ def safe_child_path(root: Path, relative_path: str) -> Path:
 
 def progress_payload(session_id: str) -> dict[str, Any]:
     metadata = load_metadata(session_id)
-    manifest = load_manifest(session_id)
+    manifest = cached_manifest(session_id)
     current = load_current_annotations(session_id)
     status_counts: dict[str, int] = {}
     for item in manifest:
@@ -214,19 +223,30 @@ def progress_payload(session_id: str) -> dict[str, Any]:
     }
 
 
-def create_app(default_session_id: str, build_defaults: dict[str, Any]) -> Flask:
+def create_app(default_session_id: str | None, build_defaults: dict[str, Any]) -> Flask:
     app = Flask(__name__, template_folder="templates", static_folder="static")
     app.config["DEFAULT_SESSION_ID"] = default_session_id
     app.config["BUILD_DEFAULTS"] = build_defaults
 
     @app.get("/")
-    def index() -> str:
-        return render_template("index.html", default_session_id=default_session_id)
+    def index() -> Any:
+        # If ?session=<id> in URL, serve the annotation UI for that session
+        session_from_url = request.args.get("session")
+        if session_from_url:
+            return render_template("index.html", session_id=session_from_url)
+        # If server was started with a pre-created session, redirect to it
+        if default_session_id:
+            return redirect(f"/?session={default_session_id}")
+        # Otherwise show the landing / session picker page
+        return render_template("landing.html")
 
     @app.get("/api/sessions")
     def api_sessions() -> Any:
         return jsonify(
-            {"sessions": list_sessions(), "default_session_id": default_session_id}
+            {
+                "sessions": list_sessions(),
+                "default_session_id": default_session_id or None,
+            }
         )
 
     @app.post("/api/sessions")
@@ -261,6 +281,7 @@ def api_create_session() -> Any:
             index_summary=index_summary,
             config=config,
         )
+        cached_manifest.cache_clear()
         return jsonify(
             {"metadata": metadata, "progress": progress_payload(metadata["session_id"])}
         )
@@ -271,18 +292,30 @@ def api_session(session_id: str) -> Any:
 
     @app.get("/api/session/<session_id>/item/<int:index>")
     def api_item(session_id: str, index: int) -> Any:
+        manifest = cached_manifest(session_id)
         item = get_item_or_404(session_id, index)
         text = item_page_text(item)
         annotations = load_current_annotations(session_id)
+        show_inline_images = request.args.get("inline_images", "1") != "0"
+        next_image_url = None
+        if index + 1 < len(manifest) and manifest[index + 1].get("raw_png_path"):
+            next_image_url = f"/api/session/{session_id}/item/{index + 1}/raw-image"
         return jsonify(
             {
                 "index": index,
-                "item_count": len(load_manifest(session_id)),
+                "item_count": len(manifest),
                 "item": item,
                 "annotation": annotations.get(item["item_id"]),
                 "page_text": text,
-                "markdown_html": render_markdown_page(text, session_id, index),
+                "markdown_html": render_markdown_page(
+                    text,
+                    session_id=session_id,
+                    index=index,
+                    show_inline_images=show_inline_images,
+                ),
+                "inline_images": show_inline_images,
                 "image_url": f"/api/session/{session_id}/item/{index}/raw-image",
+                "next_image_url": next_image_url,
             }
         )
 
@@ -298,7 +331,7 @@ def api_raw_image(session_id: str, index: int) -> Any:
             abort(400, description="raw image outside raw root")
         if not target.is_file():
             abort(404, description="raw page image missing")
-        return send_file(target)
+        return send_file(target, conditional=True, max_age=86400)
 
     @app.get("/api/session/<session_id>/item/<int:index>/inline-image/<path:rel_path>")
     def api_inline_image(session_id: str, index: int, rel_path: str) -> Any:
@@ -307,7 +340,7 @@ def api_inline_image(session_id: str, index: int, rel_path: str) -> Any:
         target = safe_child_path(report_dir, rel_path)
         if not target.is_file():
             abort(404, description="inline OCR image missing")
-        return send_file(target)
+        return send_file(target, conditional=True, max_age=86400)
 
     @app.post("/api/session/<session_id>/annotation")
     def api_save_annotation(session_id: str) -> Any:
@@ -344,7 +377,15 @@ def api_summary_md(session_id: str) -> Any:
 
 def main(argv: list[str] | None = None) -> int:
     args = build_arg_parser().parse_args(argv)
-    session_id = prepare_session(args)
+    # Session creation is optional: with no --session-id, annotator still
+    # "anonymous" and session name still "OCR annotation session", start
+    # headless so users create/resume sessions from the browser landing page.
+    session_id: str | None = None
+    if args.session_id:
+        session_id = prepare_session(args)
+    elif args.annotator != "anonymous" or args.session_name != "OCR annotation session":
+        session_id = prepare_session(args)
+
     build_defaults = {
         "ocr_root": str(args.ocr_root),
         "raw_root": str(args.raw_root),
@@ -355,7 +396,12 @@ def main(argv: list[str] | None = None) -> int:
         "limit_reports": args.limit_reports,
     }
     app = create_app(session_id, build_defaults)
-    print(f"Annotation session: {session_id}")
+    if session_id:
+        print(f"Annotation session: {session_id}")
+    else:
+        print(
+            "Starting in headless mode — users will create sessions from the browser."
+        )
     print(f"Open: http://{args.host}:{args.port}")
     app.run(host=args.host, port=args.port, debug=args.debug)
     return 0
diff --git a/annotation_OCR/static/app.js b/annotation_OCR/static/app.js
index fd49ccc..880439d 100644
--- a/annotation_OCR/static/app.js
+++ b/annotation_OCR/static/app.js
@@ -1,5 +1,7 @@
 const state = {
-    sessionId: window.OCR_ANNOTATION_DEFAULT_SESSION_ID,
+    sessionId: window.OCR_ANNOTATION_SESSION_ID
+        || new URLSearchParams(window.location.search).get('session')
+        || window.OCR_ANNOTATION_DEFAULT_SESSION_ID,
     index: 0,
     itemCount: 0,
     item: null,
@@ -7,7 +9,9 @@ const state = {
     startedAt: null,
     zoom: 1,
     showingRaw: false,
+    showInlineImages: true,
     saving: false,
+    prefetchImage: null,
 };
 
 const els = {
@@ -19,12 +23,14 @@ const els = {
     nextButton: document.getElementById('nextButton'),
     skipReviewedButton: document.getElementById('skipReviewedButton'),
     helpButton: document.getElementById('helpButton'),
+    imageCanvas: document.getElementById('imageCanvas'),
     rawImage: document.getElementById('rawImage'),
     imageMissing: document.getElementById('imageMissing'),
     imageSubtitle: document.getElementById('imageSubtitle'),
     markdownSubtitle: document.getElementById('markdownSubtitle'),
     markdownPreview: document.getElementById('markdownPreview'),
     rawMarkdown: document.getElementById('rawMarkdown'),
+    inlineImagesToggle: document.getElementById('inlineImagesToggle'),
     toggleRawButton: document.getElementById('toggleRawButton'),
     zoomOutButton: document.getElementById('zoomOutButton'),
     zoomResetButton: document.getElementById('zoomResetButton'),
@@ -36,7 +42,6 @@ const els = {
     signalsValue: document.getElementById('signalsValue'),
     mappingValue: document.getElementById('mappingValue'),
     notesInput: document.getElementById('notesInput'),
-    issueGrid: document.getElementById('issueGrid'),
     saveButton: document.getElementById('saveButton'),
     saveStatus: document.getElementById('saveStatus'),
     summaryCsvLink: document.getElementById('summaryCsvLink'),
@@ -75,8 +80,7 @@ function updateProgress(progress) {
     const reviewed = progress.reviewed_count || 0;
     const total = progress.item_count || 0;
     els.progressText.textContent = `${reviewed} / ${total} reviewed`;
-    const pct = total ? Math.round((reviewed / total) * 100) : 0;
-    els.progressBar.style.width = `${pct}%`;
+    els.progressBar.style.width = `${total ? Math.round((reviewed / total) * 100) : 0}%`;
     els.summaryCsvLink.href = `/api/session/${state.sessionId}/summary.csv`;
     els.summaryMdLink.href = `/api/session/${state.sessionId}/summary.md`;
 }
@@ -88,43 +92,23 @@ function setOverall(status) {
     });
 }
 
-function setSubchecks(values = {}) {
-    document.querySelectorAll('[data-subcheck]').forEach((select) => {
-        select.value = values[select.dataset.subcheck] || 'unreviewed';
-    });
-}
-
-function setIssues(values = []) {
-    const selected = new Set(values);
-    els.issueGrid.querySelectorAll('input[type="checkbox"]').forEach((checkbox) => {
-        checkbox.checked = selected.has(checkbox.value);
-    });
-}
-
-function getSubchecks() {
-    const subchecks = {};
-    document.querySelectorAll('[data-subcheck]').forEach((select) => {
-        subchecks[select.dataset.subcheck] = select.value;
-    });
-    return subchecks;
-}
-
-function getIssues() {
-    return Array.from(els.issueGrid.querySelectorAll('input[type="checkbox"]:checked'))
-        .map((checkbox) => checkbox.value)
-        .sort();
-}
-
 function loadAnnotation(annotation) {
     setOverall(annotation?.overall_status || 'unreviewed');
-    setSubchecks(annotation?.subchecks || {});
-    setIssues(annotation?.issue_tags || []);
     els.notesInput.value = annotation?.notes || '';
 }
 
+function fittedImageWidth() { // pixel width at which the whole raw image fits inside its stage; applyZoom multiplies it by state.zoom
+    const stage = els.imageCanvas.parentElement;
+    const availableWidth = Math.max(240, stage.clientWidth - 32); // 32 presumably = 2 x the 16px .image-stage padding; floor of 240px
+    const availableHeight = Math.max(240, stage.clientHeight - 32);
+    const naturalWidth = els.rawImage.naturalWidth || availableWidth; // naturalWidth is 0 until the image has loaded
+    const naturalHeight = els.rawImage.naturalHeight || naturalWidth * 1.414; // 1.414 ~ sqrt(2); presumably an A-series portrait page fallback
+    const fitScale = Math.min(availableWidth / naturalWidth, availableHeight / naturalHeight); // limiting dimension wins
+    return Math.max(120, Math.floor(naturalWidth * fitScale)); // never narrower than 120px
+}
+
 function applyZoom() {
-    els.rawImage.style.transform = `scale(${state.zoom})`;
-    els.rawImage.style.marginBottom = `${Math.max(0, (state.zoom - 1) * 100)}%`;
+    els.imageCanvas.style.setProperty('--image-width', `${Math.round(fittedImageWidth() * state.zoom)}px`);
     els.zoomResetButton.textContent = `${Math.round(state.zoom * 100)}%`;
 }
 
@@ -139,9 +123,22 @@ async function loadProgress() {
     return progress;
 }
 
+function prefetchNextImage(url) { // start loading the next page image in the background
+    if (!url) return; // the server sends null when there is no next raw image
+    state.prefetchImage = new Image(); // detached Image held on state; replaced on every call
+    state.prefetchImage.decoding = 'async';
+    state.prefetchImage.src = url; // NOTE(review): loadItem requests `${image_url}?v=<sha>` but this URL carries no ?v=, so the prefetched copy may never be reused - confirm
+}
+
+function resetExtractedContentScroll() {
+    els.markdownPreview.scrollTop = 0;
+    els.rawMarkdown.scrollTop = 0;
+}
+
 async function loadItem(index) {
     const safeIndex = Math.max(0, Math.min(index, Math.max(0, state.itemCount - 1)));
-    const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}`);
+    const inlineFlag = state.showInlineImages ? '1' : '0';
+    const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}?inline_images=${inlineFlag}`);
     state.index = safeIndex;
     state.item = data.item;
     state.itemCount = data.item_count;
@@ -158,11 +155,13 @@ async function loadItem(index) {
 
     els.markdownPreview.innerHTML = data.markdown_html || '';
     els.rawMarkdown.textContent = data.page_text || '';
+    resetExtractedContentScroll();
 
     if (data.item.raw_png_path) {
         els.rawImage.hidden = false;
         els.imageMissing.hidden = true;
         els.rawImage.src = `${data.image_url}?v=${encodeURIComponent(data.item.page_text_sha256)}`;
+        prefetchNextImage(data.next_image_url);
     } else {
         els.rawImage.hidden = true;
         els.imageMissing.hidden = false;
@@ -180,8 +179,6 @@ function annotationPayload(source = 'manual') {
     return {
         item_id: state.item.item_id,
         overall_status: state.overallStatus,
-        subchecks: getSubchecks(),
-        issue_tags: getIssues(),
         notes: els.notesInput.value,
         annotation_source: source,
         review_duration_ms: state.startedAt ? new Date() - state.startedAt : null,
@@ -214,40 +211,9 @@ async function saveAnnotation(source = 'manual', advance = false) {
     }
 }
 
-function quickMark(status) {
+function quickMark(status, source = 'shortcut') {
     setOverall(status);
-    if (status === 'ok') {
-        setSubchecks({
-            text_content: 'ok',
-            table_content: 'ok',
-            table_structure: 'ok',
-            page_alignment: 'ok',
-        });
-        setIssues([]);
-    } else if (status === 'not_ok') {
-        const subchecks = getSubchecks();
-        if (Object.values(subchecks).every((value) => value === 'unreviewed')) {
-            setSubchecks({
-                text_content: 'uncertain',
-                table_content: 'uncertain',
-                table_structure: 'not_ok',
-                page_alignment: 'uncertain',
-            });
-        }
-    } else if (status === 'uncertain') {
-        setSubchecks({
-            text_content: 'uncertain',
-            table_content: 'uncertain',
-            table_structure: 'uncertain',
-            page_alignment: 'uncertain',
-        });
-    }
-    saveAnnotation(`shortcut:${status}`, true);
-}
-
-function toggleIssue(tag) {
-    const checkbox = els.issueGrid.querySelector(`input[value="${tag}"]`);
-    if (checkbox) checkbox.checked = !checkbox.checked;
+    saveAnnotation(`${source}:${status}`, true);
 }
 
 async function go(delta) {
@@ -283,13 +249,19 @@ function setupEvents() {
     els.nextButton.addEventListener('click', () => go(1));
     els.skipReviewedButton.addEventListener('click', goNextOpen);
     els.saveButton.addEventListener('click', () => saveAnnotation('manual', false));
+    els.inlineImagesToggle.addEventListener('change', () => {
+        state.showInlineImages = els.inlineImagesToggle.checked;
+        loadItem(state.index);
+    });
     els.toggleRawButton.addEventListener('click', toggleRawMarkdown);
     els.zoomOutButton.addEventListener('click', () => setZoom(state.zoom - 0.15));
     els.zoomInButton.addEventListener('click', () => setZoom(state.zoom + 0.15));
     els.zoomResetButton.addEventListener('click', () => setZoom(1));
     els.helpButton.addEventListener('click', () => els.helpDialog.showModal());
+    els.rawImage.addEventListener('load', () => setZoom(1));
+    window.addEventListener('resize', applyZoom);
     document.querySelectorAll('.status-button').forEach((button) => {
-        button.addEventListener('click', () => setOverall(button.dataset.status));
+        button.addEventListener('click', () => quickMark(button.dataset.status, 'button'));
     });
 
     document.addEventListener('keydown', (event) => {
@@ -312,15 +284,6 @@ function setupEvents() {
         } else if (event.key === 'ArrowLeft' || event.key.toLowerCase() === 'k') {
             event.preventDefault();
             go(-1);
-        } else if (event.key.toLowerCase() === 't') {
-            event.preventDefault();
-            toggleIssue('broken_table');
-        } else if (event.key.toLowerCase() === 'c') {
-            event.preventDefault();
-            toggleIssue('merged_columns');
-        } else if (event.key.toLowerCase() === 'm') {
-            event.preventDefault();
-            toggleIssue('missing_text');
         } else if (event.key === '+' || event.key === '=') {
             event.preventDefault();
             setZoom(state.zoom + 0.15);
diff --git a/annotation_OCR/static/style.css b/annotation_OCR/static/style.css
index deaa160..a5e8305 100644
--- a/annotation_OCR/static/style.css
+++ b/annotation_OCR/static/style.css
@@ -92,6 +92,12 @@ button:hover,
     font-size: 12px;
 }
 
+.pane-subtitle {
+    overflow: hidden;
+    text-overflow: ellipsis;
+    white-space: nowrap;
+}
+
 .progress-block {
     display: grid;
     gap: 7px;
@@ -118,6 +124,7 @@ button:hover,
     display: flex;
     gap: 8px;
     align-items: center;
+    flex: 0 0 auto;
 }
 
 .icon-button {
@@ -128,7 +135,7 @@ button:hover,
 
 .workspace {
     display: grid;
-    grid-template-columns: minmax(340px, 1.05fr) minmax(340px, 1fr) 340px;
+    grid-template-columns: minmax(280px, 1.05fr) minmax(280px, 1fr) minmax(280px, 340px);
     gap: 14px;
     padding: 14px;
     height: calc(100vh - 82px);
@@ -136,6 +143,7 @@ button:hover,
 
 .pane,
 .annotation-panel {
+    min-width: 0;
     min-height: 0;
     background: var(--panel);
     border: 1px solid var(--line);
@@ -154,11 +162,47 @@ button:hover,
     justify-content: space-between;
     gap: 12px;
     align-items: center;
+    min-width: 0;
+    overflow: hidden;
     padding: 12px;
     border-bottom: 1px solid var(--line);
     background: var(--panel-2);
 }
 
+.pane-toolbar>div:first-child {
+    flex: 1 1 auto;
+    min-width: 0;
+}
+
+.zoom-actions {
+    margin-left: auto;
+}
+
+.preview-actions {
+    display: flex;
+    flex: 0 0 auto;
+    align-items: center;
+    gap: 10px;
+}
+
+.toggle-control {
+    display: inline-flex;
+    align-items: center;
+    gap: 6px;
+    color: var(--muted);
+    font-size: 12px;
+    white-space: nowrap;
+}
+
+.zoom-actions button {
+    width: 36px;
+    padding: 0;
+}
+
+.zoom-actions #zoomResetButton {
+    width: 58px;
+}
+
 .pane-title {
     font-size: 15px;
     font-weight: 800;
@@ -167,8 +211,7 @@ button:hover,
 .image-stage {
     position: relative;
     overflow: auto;
-    display: grid;
-    place-items: start center;
+    display: block;
     padding: 16px;
     background:
         linear-gradient(45deg, #dce3e4 25%, transparent 25%),
@@ -179,11 +222,19 @@ button:hover,
     background-position: 0 0, 0 11px, 11px -11px, -11px 0;
 }
 
+.image-canvas {
+    --image-width: 320px;
+    display: flex;
+    justify-content: center;
+    align-items: flex-start;
+    min-width: max(100%, var(--image-width));
+    min-height: 100%;
+}
+
 #rawImage {
     display: block;
+    width: var(--image-width);
     max-width: none;
-    width: min(100%, 900px);
-    transform-origin: top center;
     border: 1px solid #b7c3c5;
     background: white;
     box-shadow: 0 12px 28px rgba(31, 45, 49, 0.18);
diff --git a/annotation_OCR/store.py b/annotation_OCR/store.py
index e2cbe3a..32916b6 100644
--- a/annotation_OCR/store.py
+++ b/annotation_OCR/store.py
@@ -17,7 +17,6 @@
 SCHEMA_VERSION = "1.0"
 
 VALID_OVERALL_STATUS = {"ok", "not_ok", "uncertain", "unreviewed"}
-VALID_SUBCHECK_STATUS = {"ok", "not_ok", "uncertain", "not_applicable", "unreviewed"}
 
 SUMMARY_FIELDS = [
     "session_id",
@@ -32,11 +31,6 @@
     "page_index",
     "page_number",
     "overall_status",
-    "text_content",
-    "table_content",
-    "table_structure",
-    "page_alignment",
-    "issue_tags",
     "notes",
     "updated_at_utc",
     "annotation_source",
@@ -189,35 +183,10 @@ def sanitize_status(value: Any, valid: set[str], default: str) -> str:
 
 
 def normalize_annotation_payload(payload: dict[str, Any]) -> dict[str, Any]:
-    subchecks = (
-        payload.get("subchecks") if isinstance(payload.get("subchecks"), dict) else {}
-    )
-    normalized_subchecks = {
-        "text_content": sanitize_status(
-            subchecks.get("text_content"), VALID_SUBCHECK_STATUS, "unreviewed"
-        ),
-        "table_content": sanitize_status(
-            subchecks.get("table_content"), VALID_SUBCHECK_STATUS, "unreviewed"
-        ),
-        "table_structure": sanitize_status(
-            subchecks.get("table_structure"), VALID_SUBCHECK_STATUS, "unreviewed"
-        ),
-        "page_alignment": sanitize_status(
-            subchecks.get("page_alignment"), VALID_SUBCHECK_STATUS, "unreviewed"
-        ),
-    }
-
-    issue_tags = payload.get("issue_tags")
-    if not isinstance(issue_tags, list):
-        issue_tags = []
-    issue_tags = sorted({str(tag).strip() for tag in issue_tags if str(tag).strip()})
-
     return {
         "overall_status": sanitize_status(
             payload.get("overall_status"), VALID_OVERALL_STATUS, "unreviewed"
         ),
-        "subchecks": normalized_subchecks,
-        "issue_tags": issue_tags,
         "notes": str(payload.get("notes") or "").strip(),
         "annotation_source": str(payload.get("annotation_source") or "manual"),
         "review_duration_ms": payload.get("review_duration_ms"),
@@ -297,7 +266,6 @@ def summary_rows(session_id: str) -> list[dict[str, Any]]:
     rows: list[dict[str, Any]] = []
     for item in load_manifest(session_id):
         annotation = current.get(item["item_id"], {})
-        subchecks = annotation.get("subchecks", {}) if annotation else {}
         rows.append(
             {
                 "session_id": session_id,
@@ -312,11 +280,6 @@ def summary_rows(session_id: str) -> list[dict[str, Any]]:
                 "page_index": item.get("page_index"),
                 "page_number": item.get("page_number"),
                 "overall_status": annotation.get("overall_status", "unreviewed"),
-                "text_content": subchecks.get("text_content", "unreviewed"),
-                "table_content": subchecks.get("table_content", "unreviewed"),
-                "table_structure": subchecks.get("table_structure", "unreviewed"),
-                "page_alignment": subchecks.get("page_alignment", "unreviewed"),
-                "issue_tags": ";".join(annotation.get("issue_tags", [])),
                 "notes": annotation.get("notes", ""),
                 "updated_at_utc": annotation.get("updated_at_utc", ""),
                 "annotation_source": annotation.get("annotation_source", ""),
@@ -347,11 +310,6 @@ def write_summary_csv(path: Path, rows: list[dict[str, Any]]) -> None:
 def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None:
     metadata = load_metadata(path.parent.name)
     status_counts = Counter(row["overall_status"] for row in rows)
-    issue_counts: Counter[str] = Counter()
-    for row in rows:
-        for tag in str(row.get("issue_tags") or "").split(";"):
-            if tag:
-                issue_counts[tag] += 1
 
     reviewed = len(rows) - status_counts.get("unreviewed", 0)
     lines = [
@@ -371,13 +329,6 @@ def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None:
     for status, count in sorted(status_counts.items()):
         lines.append(f"| {status} | {count} |")
 
-    lines.extend(["", "## Issue Counts", "", "| Issue | Count |", "| --- | ---: |"])
-    if issue_counts:
-        for issue, count in issue_counts.most_common():
-            lines.append(f"| {issue} | {count} |")
-    else:
-        lines.append("| none | 0 |")
-
     atomic_write_text(path, "\n".join(lines) + "\n")
 
 
diff --git a/annotation_OCR/templates/index.html b/annotation_OCR/templates/index.html
index 2c21e6c..4254140 100644
--- a/annotation_OCR/templates/index.html
+++ b/annotation_OCR/templates/index.html
@@ -7,7 +7,7 @@
     <title>OCR Annotation</title>
     <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
     <script>
-        window.OCR_ANNOTATION_DEFAULT_SESSION_ID = {{ default_session_id | tojson }};
+        window.OCR_ANNOTATION_SESSION_ID = {{ session_id | tojson }};
     </script>
     <script defer src="{{ url_for('static', filename='app.js') }}"></script>
 </head>
@@ -28,6 +28,7 @@
         </div>
 
         <nav class="nav-actions" aria-label="Page navigation">
+            <a href="/" class="secondary-link" title="Back to session list">Sessions</a>
             <button id="prevButton" type="button">Prev</button>
             <button id="nextButton" type="button">Next</button>
             <button id="skipReviewedButton" type="button">Next open</button>
@@ -49,7 +50,9 @@
                 </div>
             </div>
             <div class="image-stage" id="imageStage">
-                <img id="rawImage" alt="Raw OCR source page">
+                <div class="image-canvas" id="imageCanvas">
+                    <img id="rawImage" alt="Raw OCR source page" decoding="async" fetchpriority="high">
+                </div>
                 <div id="imageMissing" class="missing-state" hidden>Raw image unavailable</div>
             </div>
         </section>
@@ -60,7 +63,13 @@
                     <div class="pane-title">Extracted content</div>
                     <div class="pane-subtitle" id="markdownSubtitle"></div>
                 </div>
-                <button id="toggleRawButton" type="button">Raw Markdown</button>
+                <div class="preview-actions">
+                    <label class="toggle-control">
+                        <input id="inlineImagesToggle" type="checkbox" checked>
+                        Inline images
+                    </label>
+                    <button id="toggleRawButton" type="button">Raw Markdown</button>
+                </div>
             </div>
             <article id="markdownPreview" class="markdown-preview"></article>
             <pre id="rawMarkdown" class="raw-markdown" hidden></pre>
@@ -97,75 +106,19 @@ <h1 id="reportName">Report</h1>
             <div class="panel-section decision-section">
                 <div class="section-label">Decision</div>
                 <div class="decision-buttons" role="group" aria-label="Overall status">
-                    <button type="button" class="status-button" data-status="ok">OK</button>
-                    <button type="button" class="status-button" data-status="not_ok">Not OK</button>
+                    <button type="button" class="status-button" data-status="ok">Yes</button>
+                    <button type="button" class="status-button" data-status="not_ok">No</button>
                     <button type="button" class="status-button" data-status="uncertain">Uncertain</button>
                 </div>
             </div>
 
-            <div class="panel-section subchecks-section">
-                <div class="section-label">Subchecks</div>
-                <label>Text content
-                    <select data-subcheck="text_content">
-                        <option value="unreviewed">Unreviewed</option>
-                        <option value="ok">OK</option>
-                        <option value="not_ok">Not OK</option>
-                        <option value="uncertain">Uncertain</option>
-                        <option value="not_applicable">N/A</option>
-                    </select>
-                </label>
-                <label>Table content
-                    <select data-subcheck="table_content">
-                        <option value="unreviewed">Unreviewed</option>
-                        <option value="ok">OK</option>
-                        <option value="not_ok">Not OK</option>
-                        <option value="uncertain">Uncertain</option>
-                        <option value="not_applicable">N/A</option>
-                    </select>
-                </label>
-                <label>Table structure
-                    <select data-subcheck="table_structure">
-                        <option value="unreviewed">Unreviewed</option>
-                        <option value="ok">OK</option>
-                        <option value="not_ok">Not OK</option>
-                        <option value="uncertain">Uncertain</option>
-                        <option value="not_applicable">N/A</option>
-                    </select>
-                </label>
-                <label>Page alignment
-                    <select data-subcheck="page_alignment">
-                        <option value="unreviewed">Unreviewed</option>
-                        <option value="ok">OK</option>
-                        <option value="not_ok">Not OK</option>
-                        <option value="uncertain">Uncertain</option>
-                        <option value="not_applicable">N/A</option>
-                    </select>
-                </label>
-            </div>
-
-            <div class="panel-section issues-section">
-                <div class="section-label">Issues</div>
-                <div class="issue-grid" id="issueGrid">
-                    <label><input type="checkbox" value="missing_text"> Missing text</label>
-                    <label><input type="checkbox" value="extra_text"> Extra text</label>
-                    <label><input type="checkbox" value="wrong_reading_order"> Reading order</label>
-                    <label><input type="checkbox" value="merged_columns"> Merged columns</label>
-                    <label><input type="checkbox" value="shifted_rows"> Shifted rows</label>
-                    <label><input type="checkbox" value="missing_columns"> Missing columns</label>
-                    <label><input type="checkbox" value="broken_table"> Broken table</label>
-                    <label><input type="checkbox" value="wrong_page"> Wrong page</label>
-                    <label><input type="checkbox" value="image_missing"> Image missing</label>
-                    <label><input type="checkbox" value="low_confidence"> Low confidence</label>
-                </div>
-            </div>
-
             <div class="panel-section notes-section">
-                <label for="notesInput" class="section-label">Notes</label>
+                <label for="notesInput" class="section-label">Note, if needed</label>
                 <textarea id="notesInput" rows="5" spellcheck="true"></textarea>
             </div>
 
             <div class="panel-actions">
-                <button id="saveButton" type="button" class="primary-button">Save</button>
+                <button id="saveButton" type="button" class="primary-button">Save note</button>
                 <a id="summaryCsvLink" class="secondary-link" href="#">CSV</a>
                 <a id="summaryMdLink" class="secondary-link" href="#">Markdown</a>
             </div>
@@ -181,19 +134,13 @@ <h2>Keyboard</h2>
             </div>
             <div class="shortcut-grid">
                 <span>A</span>
-                <p>OK, save, next</p>
+                <p>Yes, save, next</p>
                 <span>R</span>
-                <p>Not OK, save, next</p>
+                <p>No, save, next</p>
                 <span>U</span>
                 <p>Uncertain, save, next</p>
                 <span>J / K</span>
                 <p>Next / previous</p>
-                <span>T</span>
-                <p>Broken table</p>
-                <span>C</span>
-                <p>Merged columns</p>
-                <span>M</span>
-                <p>Missing text</p>
                 <span>+ / - / 0</span>
                 <p>Zoom</p>
             </div>
diff --git a/annotation_OCR/templates/landing.html b/annotation_OCR/templates/landing.html
new file mode 100644
index 0000000..5a69e0d
--- /dev/null
+++ b/annotation_OCR/templates/landing.html
@@ -0,0 +1,263 @@
+<!doctype html>
+<html lang="en">
+
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>OCR Annotation — Start</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
+    <style>
+        .landing {
+            max-width: 640px;
+            margin: 60px auto;
+            padding: 0 24px;
+        }
+
+        .landing h1 {
+            font-size: 1.6rem;
+            margin: 0 0 8px;
+        }
+
+        .landing .subtitle {
+            color: var(--muted);
+            margin: 0 0 32px;
+        }
+
+        .card {
+            background: var(--panel);
+            border: 1px solid var(--line);
+            border-radius: 10px;
+            padding: 24px;
+            margin-bottom: 24px;
+            box-shadow: var(--shadow);
+        }
+
+        .card h2 {
+            font-size: 1.1rem;
+            margin: 0 0 16px;
+        }
+
+        .form-row {
+            display: flex;
+            gap: 12px;
+            align-items: flex-end;
+            flex-wrap: wrap;
+        }
+
+        .form-field {
+            display: flex;
+            flex-direction: column;
+            gap: 4px;
+            flex: 1;
+            min-width: 160px;
+        }
+
+        .form-field label {
+            font-size: 0.82rem;
+            color: var(--muted);
+            font-weight: 500;
+        }
+
+        .form-field input {
+            border: 1px solid var(--line);
+            border-radius: 6px;
+            padding: 8px 12px;
+            font: inherit;
+            background: var(--panel-2);
+        }
+
+        .form-field input:focus {
+            outline: 2px solid var(--teal);
+            outline-offset: -1px;
+        }
+
+        .create-button {
+            background: var(--teal);
+            color: #fff;
+            border: none;
+            border-radius: 6px;
+            padding: 8px 20px;
+            min-height: 38px;
+            cursor: pointer;
+            font-weight: 500;
+        }
+
+        .create-button:hover {
+            background: var(--teal-dark);
+        }
+
+        .create-button:disabled {
+            opacity: 0.5;
+            cursor: not-allowed;
+        }
+
+        .session-list {
+            list-style: none;
+            margin: 0;
+            padding: 0;
+        }
+
+        .session-list li {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            padding: 12px 0;
+            border-bottom: 1px solid var(--line);
+        }
+
+        .session-list li:last-child {
+            border-bottom: none;
+        }
+
+        .session-info {
+            display: flex;
+            flex-direction: column;
+            gap: 2px;
+        }
+
+        .session-info .name {
+            font-weight: 500;
+        }
+
+        .session-info .meta {
+            font-size: 0.82rem;
+            color: var(--muted);
+        }
+
+        .resume-link {
+            background: var(--panel-2);
+            border: 1px solid var(--line);
+            border-radius: 6px;
+            padding: 6px 14px;
+            text-decoration: none;
+            color: var(--teal);
+            font-weight: 500;
+            font-size: 0.9rem;
+        }
+
+        .resume-link:hover {
+            background: var(--teal);
+            color: #fff;
+        }
+
+        .empty-state {
+            color: var(--muted);
+            font-style: italic;
+        }
+
+        .error-msg {
+            color: var(--red);
+            margin-top: 8px;
+            font-size: 0.88rem;
+        }
+    </style>
+</head>
+
+<body>
+    <div class="landing">
+        <h1>OCR Annotation</h1>
+        <p class="subtitle">Enter your name to start a new annotation session, or resume an existing one below.</p>
+
+        <div class="card">
+            <h2>New session</h2>
+            <form id="createForm" class="form-row">
+                <div class="form-field">
+                    <label for="annotatorInput">Your name</label>
+                    <input id="annotatorInput" type="text" placeholder="e.g. Alice" required autocomplete="name">
+                </div>
+                <div class="form-field">
+                    <label for="sessionNameInput">Session name <span
+                            style="font-weight:normal">(optional)</span></label>
+                    <input id="sessionNameInput" type="text" placeholder="Auto-generated if blank">
+                </div>
+                <button type="submit" class="create-button" id="createButton">Create</button>
+            </form>
+            <div id="createError" class="error-msg" hidden></div>
+        </div>
+
+        <div class="card">
+            <h2>Resume existing session</h2>
+            <ul class="session-list" id="sessionList">
+                <li class="empty-state">Loading…</li>
+            </ul>
+        </div>
+    </div>
+
+    <script>
+        const sessionList = document.getElementById('sessionList');
+        const createForm = document.getElementById('createForm');
+        const createButton = document.getElementById('createButton');
+        const createError = document.getElementById('createError');
+        const annotatorInput = document.getElementById('annotatorInput');
+        const sessionNameInput = document.getElementById('sessionNameInput');
+
+        function formatDate(iso) { // short localized date; '' for a missing or unparseable timestamp
+            const d = iso ? new Date(iso) : null;
+            if (!d || Number.isNaN(d.getTime())) return ''; // an invalid Date would otherwise render as "Invalid Date"
+            return d.toLocaleDateString(undefined, { day: 'numeric', month: 'short', year: 'numeric' });
+        }
+
+        async function loadSessions() { // fetch /api/sessions and render the "resume" list into #sessionList
+            try {
+                const resp = await fetch('/api/sessions');
+                if (!resp.ok) throw new Error(`HTTP ${resp.status}`); // an error body must not render as "No sessions yet"
+                const sessions = (await resp.json()).sessions || [];
+                if (sessions.length === 0) {
+                    sessionList.innerHTML = '<li class="empty-state">No sessions yet — create one above.</li>';
+                    return;
+                }
+                sessionList.innerHTML = sessions.map(s => {
+                    const reviewed = Number(s.completed_count) || 0; // coerced: these values go into innerHTML unescaped
+                    const total = Number(s.item_count) || 0;
+                    const pct = total ? Math.round((reviewed / total) * 100) : 0;
+                    return `<li>
+                        <div class="session-info">
+                            <span class="name">${esc(s.session_name)}</span>
+                            <span class="meta">${esc(s.annotator)} · ${reviewed}/${total} (${pct}%) · ${formatDate(s.updated_at_utc || s.created_at_utc)}</span>
+                        </div>
+                        <a class="resume-link" href="/?session=${encodeURIComponent(s.session_id)}">Resume</a>
+                    </li>`;
+                }).join('');
+            } catch (e) { // network failure, non-2xx status, or a body that is not JSON
+                sessionList.innerHTML = `<li class="empty-state">Failed to load sessions.</li>`;
+            }
+        }
+
+        function esc(str) { // HTML-escape via the DOM: assign as text, read back as markup
+            const div = document.createElement('div');
+            div.textContent = str || ''; // any falsy value becomes ''
+            return div.innerHTML; // NOTE(review): fine for element text; text serialization leaves quotes as-is, so do not reuse inside attribute values
+        }
+
+        createForm.addEventListener('submit', async (e) => {
+            e.preventDefault();
+            const annotator = annotatorInput.value.trim();
+            if (!annotator) return;
+            createButton.disabled = true;
+            createError.hidden = true;
+            const sessionName = sessionNameInput.value.trim() || `${annotator}'s session`;
+            try {
+                const resp = await fetch('/api/sessions', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ annotator, session_name: sessionName }),
+                });
+                if (!resp.ok) {
+                    const text = await resp.text();
+                    throw new Error(text || `${resp.status}`);
+                }
+                const data = await resp.json();
+                const sid = data.metadata.session_id;
+                window.location.href = `/?session=${encodeURIComponent(sid)}`;
+            } catch (err) {
+                createError.textContent = `Error: ${err.message}`;
+                createError.hidden = false;
+                createButton.disabled = false;
+            }
+        });
+
+        loadSessions();
+    </script>
+</body>
+
+</html>
\ No newline at end of file

From b6416534a116a514fd56a3c7855e3a1b863550e7 Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Thu, 28 May 2026 14:59:25 +0200
Subject: [PATCH 6/8] ENH: Focus on tables : zoom and red border across images

---
 annotation_OCR/ocr_index.py         | 370 ++++++++++++++++++++++++++--
 annotation_OCR/server.py            | 177 ++++++++++++-
 annotation_OCR/static/app.js        | 235 +++++++++++++++++-
 annotation_OCR/static/style.css     |  37 ++-
 annotation_OCR/store.py             |  18 ++
 annotation_OCR/templates/index.html |   4 +
 6 files changed, 791 insertions(+), 50 deletions(-)

diff --git a/annotation_OCR/ocr_index.py b/annotation_OCR/ocr_index.py
index 8916981..fb7ac21 100644
--- a/annotation_OCR/ocr_index.py
+++ b/annotation_OCR/ocr_index.py
@@ -1,14 +1,16 @@
-"""Build page-level OCR annotation queues.
+"""Build OCR annotation queues.
 
-The annotation UI compares one raw page image with the corresponding Markdown
-page extracted by DeepSeekOCR. Page positions are preserved exactly: page index
-``i`` in an ``.mmd`` split maps to ``pages/page_XXXX.png`` with the same
-zero-based index when the raw image exists.
+The annotation UI can work either at page level from canonical ``.mmd`` files
+or at table level from ``*_det.mmd`` files that carry OCR coordinates.
+Page positions are preserved exactly: page index ``i`` in an ``.mmd`` split
+maps to ``pages/page_XXXX.png`` with the same zero-based index when the raw
+image exists.
 """
 
 from __future__ import annotations
 
 import argparse
+import html
 import hashlib
 import json
 import random
@@ -29,6 +31,12 @@
 PAGE_SPLIT_RE = re.compile(r"<---\s*Page Split\s*--->", re.IGNORECASE)
 REPORT_NAME_RE = re.compile(r"^([A-Z0-9-]+)_(.+)_(\d{4})(?:_[0-9a-fA-F]{8,})?$")
 HASH_SUFFIX_RE = re.compile(r"_[0-9a-fA-F]{8,}$")
+DET_HEADER_RE = re.compile(
+    r"(?m)^<\|ref\|>([^<]+)<\|/ref\|><\|det\|>(.*?)<\|/det\|>\s*$"
+)
+HTML_ROW_RE = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.IGNORECASE | re.DOTALL)
+HTML_CELL_RE = re.compile(r"<t[dh]\b[^>]*>(.*?)</t[dh]>", re.IGNORECASE | re.DOTALL)
+HTML_TAG_RE = re.compile(r"<[^>]+>")
 
 CORE_KPI_ALIASES = {
     "revenue": [
@@ -106,6 +114,25 @@ class ReportInfo:
     year: int
     report_dir: Path
     mmd_path: Path
+    det_mmd_path: Path | None
+
+
+@dataclass(frozen=True)
+class DetBlock:  # One ref/det header plus the text that follows it on a det .mmd page.
+    ref_type: str  # ref label, stripped and lowercased (e.g. "table", "text")
+    bbox_raw: str  # stripped text found between the det markers
+    bboxes: list[list[int]]  # bbox_raw parsed into 4-integer boxes
+    payload: str  # stripped text between this header and the next one
+
+
+@dataclass(frozen=True)
+class TableSourceInfo:  # Resolved det .mmd and page images used to build table items.
+    report_dir: Path  # directory the det .mmd was taken from (raw dir or OCR report dir)
+    mmd_path: Path  # canonical .mmd, or the det .mmd when no canonical one was found
+    det_mmd_path: Path
+    page_pngs: list[Path]
+    mapping_status: str
+    source_warning: str | None = None  # only set on the fallback path
 
 
 @dataclass
@@ -133,6 +160,16 @@ class PageItem:
     page_text_chars: int
     page_text_preview: str
     page_text: str
+    item_kind: str = "page"
+    det_mmd_path: str | None = None
+    table_index: int | None = None
+    table_row_count: int | None = None
+    table_col_count: int | None = None
+    focus_bbox: list[int] | None = None
+    focus_bboxes: list[list[int]] | None = None
+    table_html: str | None = None
+    context_before: str = ""
+    context_after: str = ""
 
     def to_manifest_record(self, *, include_text: bool = False) -> dict[str, Any]:
         record = asdict(self)
@@ -179,6 +216,19 @@ def find_mmd(report_dir: Path) -> Path | None:
     return fallback[0] if fallback else None
 
 
+def find_det_mmd(report_dir: Path) -> Path | None:  # Locate the "*_det.mmd" file of a report directory.
+    preferred = report_dir / f"{report_dir.name}_det.mmd"  # exact directory-name match first
+    if preferred.is_file():
+        return preferred
+
+    base_preferred = report_dir / f"{report_base_name(report_dir.name)}_det.mmd"  # then the report_base_name() variant
+    if base_preferred.is_file():
+        return base_preferred
+
+    candidates = sorted(report_dir.glob("*_det.mmd"))  # last resort: first match in sorted order
+    return candidates[0] if candidates else None
+
+
 def discover_reports(root: Path) -> list[ReportInfo]:
     reports: list[ReportInfo] = []
     seen_dirs = sorted({mmd.parent for mmd in root.rglob("*.mmd")})
@@ -187,7 +237,8 @@ def discover_reports(root: Path) -> list[ReportInfo]:
         if parsed is None:
             continue
         mmd_path = find_mmd(report_dir)
-        if mmd_path is None:
+        det_mmd_path = find_det_mmd(report_dir)
+        if mmd_path is None and det_mmd_path is None:
             continue
         exchange, ticker, year = parsed
         industry_slug = report_dir.parent.name
@@ -199,7 +250,8 @@ def discover_reports(root: Path) -> list[ReportInfo]:
                 ticker=ticker,
                 year=year,
                 report_dir=report_dir,
-                mmd_path=mmd_path,
+                mmd_path=mmd_path or det_mmd_path,
+                det_mmd_path=det_mmd_path,
             )
         )
     return reports
@@ -217,6 +269,92 @@ def load_pages(mmd_path: Path) -> list[str]:
     return split_pages(raw)
 
 
+def parse_bboxes(raw: str) -> list[list[int]]:  # Group every integer found in raw into boxes of four values.
+    coords = [int(value) for value in re.findall(r"-?\d+", raw)]
+    boxes: list[list[int]] = []
+    for index in range(0, len(coords), 4):
+        chunk = coords[index : index + 4]
+        if len(chunk) == 4:  # a trailing group of fewer than four integers is dropped
+            boxes.append(chunk)
+    return boxes
+
+
+def parse_det_blocks(page_text: str) -> list[DetBlock]:  # Split one page into DetBlock records, one per ref/det header line.
+    matches = list(DET_HEADER_RE.finditer(page_text))
+    if not matches:
+        return []
+
+    blocks: list[DetBlock] = []
+    for index, match in enumerate(matches):
+        payload_start = match.end()  # text before the first header is never part of a payload
+        payload_end = (  # payload runs up to the next header, or to the end of the page
+            matches[index + 1].start() if index + 1 < len(matches) else len(page_text)
+        )
+        payload = page_text[payload_start:payload_end].strip()
+        bbox_raw = match.group(2).strip()
+        blocks.append(
+            DetBlock(
+                ref_type=match.group(1).strip().lower(),
+                bbox_raw=bbox_raw,
+                bboxes=parse_bboxes(bbox_raw),
+                payload=payload,
+            )
+        )
+    return blocks
+
+
+def strip_html(value: str) -> str:  # Drop tags, unescape entities, collapse whitespace runs to single spaces.
+    text = HTML_TAG_RE.sub(" ", value)  # tags become spaces so adjacent cell texts stay separated
+    return " ".join(html.unescape(text).split())
+
+
+def table_dimensions(table_html: str) -> tuple[int, int]:  # Return (number of <tr> rows, cell count of the widest row).
+    row_count = 0
+    col_count = 0
+    for row_html in HTML_ROW_RE.findall(table_html):
+        row_count += 1
+        col_count = max(col_count, len(HTML_CELL_RE.findall(row_html)))  # each <td>/<th> counts once; attributes are ignored
+    return row_count, col_count
+
+
+def combined_bbox(bboxes: list[list[int]]) -> list[int] | None:  # Merge boxes into one; None when the list is empty.
+    if not bboxes:
+        return None
+    return [  # presumably [x1, y1, x2, y2] boxes, giving their enclosing box - confirm against the OCR format
+        min(box[0] for box in bboxes),
+        min(box[1] for box in bboxes),
+        max(box[2] for box in bboxes),
+        max(box[3] for box in bboxes),
+    ]
+
+
+def nearby_context(blocks: list[DetBlock], block_index: int, *, direction: int) -> str:  # Text of up to two nearest text/title blocks on one side.
+    collected: list[str] = []
+    index = block_index + direction  # direction is the step applied to the index (-1 or 1 at the call sites in this patch)
+    while 0 <= index < len(blocks) and len(collected) < 2:
+        block = blocks[index]
+        if block.ref_type in {"text", "title", "sub_title"} and block.payload:  # other block types are stepped over
+            collected.append(strip_html(block.payload))
+        index += direction
+    if direction < 0:
+        collected.reverse()  # restore document order after walking backwards
+    return "\n".join(value for value in collected if value)
+
+
+def detect_table_reasons(
+    table_html: str, context_before: str, context_after: str
+) -> list[str]:  # Always starts with "det-table", then adds de-duplicated detect_candidate_reasons() results.
+    reasons = ["det-table"]
+    seen = set(reasons)
+    for reason in detect_candidate_reasons(
+        "\n".join(part for part in [context_before, table_html, context_after] if part)  # empty parts are left out
+    ):
+        if reason not in seen:
+            seen.add(reason)
+            reasons.append(reason)
+    return reasons
+
+
 def resolve_raw_dir(report: ReportInfo, raw_root: Path) -> tuple[Path | None, str]:
     industry_root = raw_root / report.industry_slug
     if not industry_root.is_dir():
@@ -250,6 +388,44 @@ def list_page_pngs(raw_dir: Path | None) -> list[Path]:
     return sorted(p for p in pages_dir.glob("page_*.png") if p.is_file())
 
 
+def resolve_table_source(report: ReportInfo, raw_root: Path) -> TableSourceInfo | None:  # Choose the det .mmd and page images for a report.
+    raw_dir, raw_status = resolve_raw_dir(report, raw_root)
+    if raw_dir is not None:
+        raw_det_mmd = find_det_mmd(raw_dir)
+        raw_mmd = find_mmd(raw_dir)
+        raw_page_pngs = list_page_pngs(raw_dir)
+        if raw_det_mmd is not None and raw_page_pngs:  # preferred: det file and page images both found in the raw dir
+            return TableSourceInfo(
+                report_dir=raw_dir,
+                mmd_path=raw_mmd or raw_det_mmd,
+                det_mmd_path=raw_det_mmd,
+                page_pngs=raw_page_pngs,
+                mapping_status=raw_status,
+            )
+
+    local_det_mmd = report.det_mmd_path  # fallback: det file discovered with the OCR report
+    if local_det_mmd is None:
+        return None
+
+    fallback_page_pngs = list_page_pngs(raw_dir)  # raw_dir may be None here
+    source_warning = None
+    if raw_dir is not None:  # raw dir exists but lacked a det file or page images
+        source_warning = "table-source-fallback-pruned-det"
+        mapping_status = raw_status
+    else:
+        source_warning = "table-source-no-raw-match"
+        mapping_status = "raw-dir-missing"
+
+    return TableSourceInfo(
+        report_dir=report.report_dir,
+        mmd_path=report.mmd_path,
+        det_mmd_path=local_det_mmd,
+        page_pngs=fallback_page_pngs,
+        mapping_status=mapping_status,
+        source_warning=source_warning,
+    )
+
+
 def page_png_for(page_pngs: list[Path], page_index: int) -> Path | None:
     expected_name = f"page_{page_index:04d}.png"
     for path in page_pngs:
@@ -308,7 +484,12 @@ def page_text_hash(text: str) -> str:
 
 
 def make_mapping_warnings(
-    *, raw_dir: Path | None, page_pngs: list[Path], page_index: int, mmd_page_count: int
+    *,
+    raw_dir: Path | None,
+    page_pngs: list[Path],
+    page_index: int,
+    mmd_page_count: int,
+    extra_warnings: list[str] | None = None,
 ) -> list[str]:
     warnings: list[str] = []
     if raw_dir is None:
@@ -319,6 +500,8 @@ def make_mapping_warnings(
         warnings.append("page-count-mismatch")
     if page_png_for(page_pngs, page_index) is None:
         warnings.append("raw-page-image-missing")
+    if extra_warnings:
+        warnings.extend(extra_warnings)
     return warnings
 
 
@@ -391,10 +574,116 @@ def iter_page_items(
             )
 
 
+def iter_table_items(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    limit_reports: int | None = None,
+):  # Yield one PageItem with item_kind="table" per table block that has a payload.
+    reports = discover_reports(ocr_root)
+    if limit_reports is not None:
+        reports = reports[:limit_reports]
+
+    for report in reports:
+        table_source = resolve_table_source(report, raw_root)
+        if table_source is None:  # no det .mmd available for this report
+            continue
+
+        pages = load_pages(table_source.det_mmd_path)
+        raw_dir = table_source.report_dir  # NOTE(review): on the fallback path this is the OCR report dir, not a raw dir
+        raw_status = table_source.mapping_status
+        page_pngs = table_source.page_pngs
+        mmd_page_count = len(pages)
+        png_page_count = len(page_pngs)
+        extra_warnings = (
+            [table_source.source_warning] if table_source.source_warning else []
+        )
+
+        for page_index, page_text in enumerate(pages):
+            blocks = parse_det_blocks(page_text)
+            if not blocks:  # page without any ref/det header
+                continue
+
+            warnings = make_mapping_warnings(
+                raw_dir=raw_dir,
+                page_pngs=page_pngs,
+                page_index=page_index,
+                mmd_page_count=mmd_page_count,
+                extra_warnings=extra_warnings,
+            )
+            raw_png = page_png_for(page_pngs, page_index)
+            table_index = 0  # restarts on every page; only yielded tables advance it
+
+            for block_index, block in enumerate(blocks):
+                if block.ref_type != "table" or not block.payload:
+                    continue
+
+                context_before = nearby_context(blocks, block_index, direction=-1)
+                context_after = nearby_context(blocks, block_index, direction=1)
+                row_count, col_count = table_dimensions(block.payload)
+                focus_bboxes = [list(box) for box in block.bboxes]  # copies, so the item does not share lists with the DetBlock
+                focus_bbox = combined_bbox(focus_bboxes)
+                reasons = detect_table_reasons(
+                    block.payload,
+                    context_before=context_before,
+                    context_after=context_after,
+                )
+                item_id = (
+                    f"{report.industry_slug}/{report.name}/page_{page_index:04d}"
+                    f"/table_{table_index:03d}"
+                )
+                preview_parts = [
+                    context_before,
+                    strip_html(block.payload),
+                    context_after,
+                ]
+                yield PageItem(
+                    item_id=item_id,
+                    industry_slug=report.industry_slug,
+                    report_name=report.name,
+                    exchange=report.exchange,
+                    ticker=report.ticker,
+                    year=report.year,
+                    page_index=page_index,
+                    page_number=page_index + 1,
+                    ocr_root=str(ocr_root),
+                    raw_root=str(raw_root),
+                    report_dir=str(table_source.report_dir),
+                    raw_dir=str(raw_dir) if raw_dir else None,
+                    mmd_path=str(table_source.mmd_path),
+                    raw_png_path=str(raw_png) if raw_png else None,
+                    mmd_page_count=mmd_page_count,
+                    png_page_count=png_page_count,
+                    mapping_status=raw_status,
+                    mapping_warnings=warnings,
+                    candidate_reasons=reasons,
+                    page_text_sha256=page_text_hash(block.payload),  # hash and length describe the table payload, not the page
+                    page_text_chars=len(block.payload),
+                    page_text_preview=text_preview(
+                        "\n".join(part for part in preview_parts if part)
+                    ),
+                    page_text="",  # the table text is carried in table_html instead
+                    item_kind="table",
+                    det_mmd_path=str(table_source.det_mmd_path),
+                    table_index=table_index,
+                    table_row_count=row_count,
+                    table_col_count=col_count,
+                    focus_bbox=focus_bbox,
+                    focus_bboxes=focus_bboxes,
+                    table_html=block.payload,
+                    context_before=context_before,
+                    context_after=context_after,
+                )
+                table_index += 1
+
+
 def new_summary_state() -> dict[str, Any]:
     return {
         "report_names": set(),
-        "pages_total": 0,
+        "page_keys": set(),
+        "items_total": 0,
+        "page_items_total": 0,
+        "table_items_total": 0,
         "mapping_status_counts": {},
         "mapping_warning_counts": {},
         "candidate_reason_counts": {},
@@ -403,7 +692,12 @@ def new_summary_state() -> dict[str, Any]:
 
 def update_summary_state(state: dict[str, Any], item: PageItem) -> None:
     state["report_names"].add(item.report_name)
-    state["pages_total"] += 1
+    state["page_keys"].add((item.report_name, item.page_index))
+    state["items_total"] += 1
+    if item.item_kind == "table":
+        state["table_items_total"] += 1
+    else:
+        state["page_items_total"] += 1
     statuses = state["mapping_status_counts"]
     statuses[item.mapping_status] = statuses.get(item.mapping_status, 0) + 1
     warnings = state["mapping_warning_counts"]
@@ -419,9 +713,14 @@ def finish_summary_state(
 ) -> dict[str, Any]:
     return {
         "reports_total": len(state["report_names"]),
-        "pages_total": state["pages_total"],
+        "pages_total": len(state["page_keys"]),
+        "items_total": state["items_total"],
+        "page_items_total": state["page_items_total"],
+        "table_items_total": state["table_items_total"],
         "queue_reports": len({item.report_name for item in queue}),
-        "queue_pages": len(queue),
+        "queue_pages": len({(item.report_name, item.page_index) for item in queue}),
+        "queue_items": len(queue),
+        "queue_table_items": sum(1 for item in queue if item.item_kind == "table"),
         "mapping_status_counts": state["mapping_status_counts"],
         "mapping_warning_counts": state["mapping_warning_counts"],
         "candidate_reason_counts": state["candidate_reason_counts"],
@@ -440,6 +739,19 @@ def select_queue(
         selected = list(items)
     elif queue_mode == "table-candidates":
         selected = [item for item in items if item.candidate_reasons]
+    elif queue_mode == "tables":
+        selected = list(items)
+        if sample_size is not None:
+            rng = random.Random(seed)
+            selected = rng.sample(selected, min(sample_size, len(selected)))
+            selected.sort(
+                key=lambda item: (
+                    item.industry_slug,
+                    item.report_name,
+                    item.page_index,
+                    item.table_index or -1,
+                )
+            )
     elif queue_mode == "sample":
         size = sample_size if sample_size is not None else 100
         rng = random.Random(seed)
@@ -459,13 +771,13 @@ def build_queue(
     *,
     ocr_root: Path,
     raw_root: Path,
-    queue_mode: str = "table-candidates",
+    queue_mode: str = "tables",
     sample_size: int | None = None,
     seed: int = 17,
     limit: int | None = None,
     limit_reports: int | None = None,
 ) -> tuple[list[PageItem], dict[str, Any]]:
-    if queue_mode not in {"all", "table-candidates", "sample"}:
+    if queue_mode not in {"all", "table-candidates", "sample", "tables"}:
         raise ValueError(f"unknown queue mode: {queue_mode}")
 
     queue: list[PageItem] = []
@@ -474,14 +786,17 @@ def build_queue(
     sample_seen = 0
     sample_target = sample_size if sample_size is not None else 100
     scan_stopped_by_limit = False
+    item_iterator = iter_table_items if queue_mode == "tables" else iter_page_items
 
-    for item in iter_page_items(
+    for item in item_iterator(
         ocr_root=ocr_root,
         raw_root=raw_root,
         limit_reports=limit_reports,
     ):
         update_summary_state(summary_state, item)
-        if queue_mode == "sample":
+        if queue_mode == "sample" or (
+            queue_mode == "tables" and sample_size is not None
+        ):
             sample_seen += 1
             if len(queue) < sample_target:
                 queue.append(item)
@@ -491,7 +806,7 @@ def build_queue(
                     queue[replace_at] = item
             continue
 
-        include_item = queue_mode == "all" or bool(item.candidate_reasons)
+        include_item = queue_mode in {"all", "tables"} or bool(item.candidate_reasons)
         if not include_item:
             continue
         queue.append(item)
@@ -499,9 +814,14 @@ def build_queue(
             scan_stopped_by_limit = True
             break
 
-    if queue_mode == "sample":
+    if queue_mode == "sample" or (queue_mode == "tables" and sample_size is not None):
         queue.sort(
-            key=lambda item: (item.industry_slug, item.report_name, item.page_index)
+            key=lambda item: (
+                item.industry_slug,
+                item.report_name,
+                item.page_index,
+                item.table_index or -1,
+            )
         )
         if limit is not None:
             queue = queue[:limit]
@@ -525,6 +845,7 @@ def build_queue(
 def summarize_items(all_items: list[PageItem], queue: list[PageItem]) -> dict[str, Any]:
     report_names = {item.report_name for item in all_items}
     queue_reports = {item.report_name for item in queue}
+    page_keys = {(item.report_name, item.page_index) for item in all_items}
     warnings: dict[str, int] = {}
     statuses: dict[str, int] = {}
     reason_counts: dict[str, int] = {}
@@ -536,9 +857,12 @@ def summarize_items(all_items: list[PageItem], queue: list[PageItem]) -> dict[st
             reason_counts[reason] = reason_counts.get(reason, 0) + 1
     return {
         "reports_total": len(report_names),
-        "pages_total": len(all_items),
+        "pages_total": len(page_keys),
+        "items_total": len(all_items),
+        "table_items_total": sum(1 for item in all_items if item.item_kind == "table"),
         "queue_reports": len(queue_reports),
-        "queue_pages": len(queue),
+        "queue_pages": len({(item.report_name, item.page_index) for item in queue}),
+        "queue_items": len(queue),
         "mapping_status_counts": statuses,
         "mapping_warning_counts": warnings,
         "candidate_reason_counts": reason_counts,
@@ -558,8 +882,8 @@ def build_arg_parser() -> argparse.ArgumentParser:
     parser.add_argument("--raw-root", type=Path, default=DEFAULT_RAW_ROOT)
     parser.add_argument(
         "--queue-mode",
-        choices=["all", "table-candidates", "sample"],
-        default="table-candidates",
+        choices=["all", "table-candidates", "sample", "tables"],
+        default="tables",
     )
     parser.add_argument("--sample-size", type=int, default=None)
     parser.add_argument("--seed", type=int, default=17)
diff --git a/annotation_OCR/server.py b/annotation_OCR/server.py
index 16460d5..727a04a 100644
--- a/annotation_OCR/server.py
+++ b/annotation_OCR/server.py
@@ -1,8 +1,9 @@
-"""Browser-based OCR page annotation server."""
+"""Browser-based OCR annotation server."""
 
 from __future__ import annotations
 
 import argparse
+import json
 import re
 from functools import lru_cache
 from pathlib import Path
@@ -26,6 +27,7 @@
 
 
 HERE = Path(__file__).resolve().parent
+DEFAULT_TABLE_MANIFEST = HERE / "manifests" / "tables_5000.json"
 IMAGE_REF_RE = re.compile(r"(!\[[^\]]*\]\()((?:\./)?images/[^)\s]+)(\))")
 
 ALLOWED_TAGS = set(bleach.sanitizer.ALLOWED_TAGS).union(
@@ -71,14 +73,26 @@ def build_arg_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("--session-name", default="OCR annotation session")
     parser.add_argument("--annotator", default="anonymous")
+    parser.add_argument(
+        "--study-bundle",
+        type=Path,
+        default=None,
+        help="Optional per-session study bundle. When set, each new session gets the next precomputed session queue.",
+    )
+    parser.add_argument(
+        "--manifest-path",
+        type=Path,
+        default=DEFAULT_TABLE_MANIFEST if DEFAULT_TABLE_MANIFEST.is_file() else None,
+        help="Optional precomputed queue manifest to reuse instead of rescanning OCR files.",
+    )
     parser.add_argument(
         "--queue-mode",
-        choices=["all", "table-candidates", "sample"],
-        default="table-candidates",
+        choices=["all", "table-candidates", "sample", "tables"],
+        default="tables",
     )
-    parser.add_argument("--sample-size", type=int, default=None)
+    parser.add_argument("--sample-size", type=int, default=5000)
     parser.add_argument("--seed", type=int, default=17)
-    parser.add_argument("--limit", type=int, default=None, help="Maximum queued pages.")
+    parser.add_argument("--limit", type=int, default=None, help="Maximum queued items.")
     parser.add_argument(
         "--limit-reports",
         type=int,
@@ -96,7 +110,9 @@ def prepare_session(args: argparse.Namespace) -> str:
         metadata = load_metadata(args.session_id)
         return metadata["session_id"]
 
-    queue, index_summary = build_queue(
+    manifest_items, index_summary, study_config = resolve_session_source(
+        study_bundle_path=args.study_bundle,
+        manifest_path=args.manifest_path,
         ocr_root=args.ocr_root,
         raw_root=args.raw_root,
         queue_mode=args.queue_mode,
@@ -108,16 +124,21 @@ def prepare_session(args: argparse.Namespace) -> str:
     config = {
         "ocr_root": str(args.ocr_root),
         "raw_root": str(args.raw_root),
+        "study_bundle_path": str(args.study_bundle.resolve())
+        if args.study_bundle
+        else None,
+        "manifest_path": str(args.manifest_path) if args.manifest_path else None,
         "queue_mode": args.queue_mode,
         "sample_size": args.sample_size,
         "seed": args.seed,
         "limit": args.limit,
         "limit_reports": args.limit_reports,
+        **study_config,
     }
     metadata = create_session(
         session_name=args.session_name,
         annotator=args.annotator,
-        manifest_items=[item.to_manifest_record() for item in queue],
+        manifest_items=manifest_items,
         index_summary=index_summary,
         config=config,
     )
@@ -134,6 +155,123 @@ def cached_manifest(session_id: str) -> tuple[dict[str, Any], ...]:
     return tuple(load_manifest(session_id))
 
 
+def load_precomputed_manifest(
+    manifest_path: Path,
+) -> tuple[list[dict[str, Any]], dict[str, Any]]:  # Read a JSON manifest and return (items, summary); ValueError on a bad shape.
+    payload = json.loads(manifest_path.read_text(encoding="utf-8"))  # assumes the top-level JSON value is an object - TODO confirm
+    items = payload.get("items")
+    if not isinstance(items, list):
+        raise ValueError(f"invalid manifest items in {manifest_path}")
+    summary = payload.get("summary") or {}  # a missing or empty summary becomes {}
+    if not isinstance(summary, dict):
+        raise ValueError(f"invalid manifest summary in {manifest_path}")
+    summary = {**summary, "manifest_path": str(manifest_path)}  # record which manifest the queue came from
+    return items, summary
+
+
+def load_study_bundle(bundle_path: Path) -> dict[str, Any]:  # Read a study bundle JSON file; ValueError unless type tag and sessions list are valid.
+    payload = json.loads(bundle_path.read_text(encoding="utf-8"))  # assumes the top-level JSON value is an object - TODO confirm
+    sessions = payload.get("sessions")
+    if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance(
+        sessions, list
+    ):
+        raise ValueError(f"invalid study bundle in {bundle_path}")
+    return payload
+
+
+def claimed_study_slots(bundle_path: Path) -> set[int]:  # Slots of this bundle already recorded in listed sessions' config.
+    resolved = str(bundle_path.resolve())  # compared with the "study_bundle_path" string stored in session config
+    claimed: set[int] = set()
+    for metadata in list_sessions():
+        config = metadata.get("config") or {}
+        if config.get("study_bundle_path") != resolved:  # session belongs to another bundle, or to none
+            continue
+        slot = config.get("study_slot")
+        if isinstance(slot, int):
+            claimed.add(slot)
+        elif isinstance(slot, str) and slot.isdigit():  # also accept slots stored as digit strings
+            claimed.add(int(slot))
+    return claimed
+
+
+def allocate_study_session(
+    bundle_path: Path,
+) -> tuple[list[dict[str, Any]], dict[str, Any], dict[str, Any]]:  # Pick the first unclaimed bundle session; returns (items, summary, config).
+    bundle = load_study_bundle(bundle_path)
+    claimed = claimed_study_slots(bundle_path)  # NOTE(review): a slot counts as claimed only once a listed session stores it in its config
+    sessions = bundle["sessions"]
+    next_session = None
+    for entry in sessions:
+        slot = entry.get("slot")
+        if isinstance(slot, int) and slot not in claimed:  # first entry, in bundle order, whose integer slot is free
+            next_session = entry
+            break
+    if next_session is None:
+        raise ValueError(f"all study sessions already assigned for {bundle_path}")
+
+    items = next_session.get("items")
+    if not isinstance(items, list):
+        raise ValueError(f"invalid study session items in {bundle_path}")
+    summary = bundle.get("summary") or {}
+    if not isinstance(summary, dict):  # a non-dict summary is replaced by {} rather than rejected
+        summary = {}
+    slot = int(next_session["slot"])
+    summary = {
+        **summary,
+        "study_bundle_path": str(bundle_path.resolve()),
+        "study_slot": slot,
+        "study_target_items": next_session.get("target_items"),
+        "study_agreement_items": next_session.get("agreement_items"),
+        "study_single_items": next_session.get("single_items"),
+    }
+    config = {  # same study fields as in summary, minus the bundle path
+        "study_slot": slot,
+        "study_target_items": next_session.get("target_items"),
+        "study_agreement_items": next_session.get("agreement_items"),
+        "study_single_items": next_session.get("single_items"),
+    }
+    return items, summary, config
+
+
+def resolve_session_source(
+    *,
+    study_bundle_path: Path | None,
+    manifest_path: Path | None,
+    ocr_root: Path,
+    raw_root: Path,
+    queue_mode: str,
+    sample_size: int | None,
+    seed: int,
+    limit: int | None,
+    limit_reports: int | None,
+) -> tuple[list[dict[str, Any]], dict[str, Any], dict[str, Any]]:  # Return (manifest items, summary, extra config) for a new session.
+    if study_bundle_path is not None:  # highest priority: next unclaimed session of a study bundle
+        items, summary, config = allocate_study_session(study_bundle_path)
+        if limit is not None:
+            items = items[:limit]
+            summary = {**summary, "limit": limit}
+            config = {**config, "limit": limit}
+        return items, summary, config
+
+    if manifest_path is not None:  # next: reuse a precomputed manifest; the queue/sampling arguments are not used here
+        items, summary = load_precomputed_manifest(manifest_path)
+        if limit is not None:
+            items = items[:limit]
+            summary = {**summary, "limit": limit}
+        return items, summary, {}
+
+    queue, index_summary = build_queue(  # otherwise scan the OCR files and build the queue now
+        ocr_root=ocr_root,
+        raw_root=raw_root,
+        queue_mode=queue_mode,
+        sample_size=sample_size,
+        seed=seed,
+        limit=limit,
+        limit_reports=limit_reports,
+    )
+    return [item.to_manifest_record() for item in queue], index_summary, {}
+
+
 def get_item_or_404(session_id: str, index: int) -> dict[str, Any]:
     manifest = cached_manifest(session_id)
     if index < 0 or index >= len(manifest):
@@ -142,6 +280,8 @@ def get_item_or_404(session_id: str, index: int) -> dict[str, Any]:
 
 
 def item_page_text(item: dict[str, Any]) -> str:
+    if item.get("item_kind") == "table":
+        return str(item.get("table_html") or "")
     pages = cached_pages(item["mmd_path"])
     page_index = int(item.get("page_index", 0))
     if page_index < 0 or page_index >= len(pages):
@@ -254,7 +394,17 @@ def api_create_session() -> Any:
         payload = request.get_json(force=True, silent=True) or {}
         defaults = app.config["BUILD_DEFAULTS"]
         queue_mode = payload.get("queue_mode") or defaults["queue_mode"]
-        queue, index_summary = build_queue(
+        study_bundle_value = payload.get("study_bundle_path") or defaults.get(
+            "study_bundle_path"
+        )
+        study_bundle_path = Path(study_bundle_value) if study_bundle_value else None
+        manifest_path_value = payload.get("manifest_path") or defaults.get(
+            "manifest_path"
+        )
+        manifest_path = Path(manifest_path_value) if manifest_path_value else None
+        manifest_items, index_summary, study_config = resolve_session_source(
+            study_bundle_path=study_bundle_path,
+            manifest_path=manifest_path,
             ocr_root=Path(payload.get("ocr_root") or defaults["ocr_root"]),
             raw_root=Path(payload.get("raw_root") or defaults["raw_root"]),
             queue_mode=queue_mode,
@@ -266,6 +416,10 @@ def api_create_session() -> Any:
         config = {
             "ocr_root": payload.get("ocr_root") or defaults["ocr_root"],
             "raw_root": payload.get("raw_root") or defaults["raw_root"],
+            "study_bundle_path": str(study_bundle_path.resolve())
+            if study_bundle_path
+            else None,
+            "manifest_path": str(manifest_path) if manifest_path else None,
             "queue_mode": queue_mode,
             "sample_size": payload.get("sample_size", defaults.get("sample_size")),
             "seed": int(payload.get("seed", defaults["seed"])),
@@ -273,11 +427,12 @@ def api_create_session() -> Any:
             "limit_reports": payload.get(
                 "limit_reports", defaults.get("limit_reports")
             ),
+            **study_config,
         }
         metadata = create_session(
             session_name=str(payload.get("session_name") or "OCR annotation session"),
             annotator=str(payload.get("annotator") or "anonymous"),
-            manifest_items=[item.to_manifest_record() for item in queue],
+            manifest_items=manifest_items,
             index_summary=index_summary,
             config=config,
         )
@@ -389,6 +544,10 @@ def main(argv: list[str] | None = None) -> int:
     build_defaults = {
         "ocr_root": str(args.ocr_root),
         "raw_root": str(args.raw_root),
+        "study_bundle_path": str(args.study_bundle.resolve())
+        if args.study_bundle
+        else None,
+        "manifest_path": str(args.manifest_path) if args.manifest_path else None,
         "queue_mode": args.queue_mode,
         "sample_size": args.sample_size,
         "seed": args.seed,
diff --git a/annotation_OCR/static/app.js b/annotation_OCR/static/app.js
index 880439d..d9a18c0 100644
--- a/annotation_OCR/static/app.js
+++ b/annotation_OCR/static/app.js
@@ -14,6 +14,12 @@ const state = {
     prefetchImage: null,
 };
 
+const IMAGE_STAGE_PADDING = 16;
+const DET_COORD_MAX = 999;
+const FOCUS_VIEWPORT_MARGIN = 12;
+const FOCUS_BOX_OVERSCAN_X = 1.06;
+const FOCUS_BOX_OVERSCAN_Y = 1.08;
+
 const els = {
     sessionTitle: document.getElementById('sessionTitle'),
     sessionMeta: document.getElementById('sessionMeta'),
@@ -23,8 +29,10 @@ const els = {
     nextButton: document.getElementById('nextButton'),
     skipReviewedButton: document.getElementById('skipReviewedButton'),
     helpButton: document.getElementById('helpButton'),
+    imageStage: document.getElementById('imageStage'),
     imageCanvas: document.getElementById('imageCanvas'),
     rawImage: document.getElementById('rawImage'),
+    imageOverlay: document.getElementById('imageOverlay'),
     imageMissing: document.getElementById('imageMissing'),
     imageSubtitle: document.getElementById('imageSubtitle'),
     markdownSubtitle: document.getElementById('markdownSubtitle'),
@@ -35,6 +43,7 @@ const els = {
     zoomOutButton: document.getElementById('zoomOutButton'),
     zoomResetButton: document.getElementById('zoomResetButton'),
     zoomInButton: document.getElementById('zoomInButton'),
+    refocusButton: document.getElementById('refocusButton'),
     reportName: document.getElementById('reportName'),
     industryValue: document.getElementById('industryValue'),
     tickerValue: document.getElementById('tickerValue'),
@@ -108,7 +117,14 @@ function fittedImageWidth() {
 }
 
 function applyZoom() {
-    els.imageCanvas.style.setProperty('--image-width', `${Math.round(fittedImageWidth() * state.zoom)}px`);
+    const placement = imagePlacement();
+    els.imageCanvas.style.setProperty('--canvas-width', `${Math.round(placement.canvasWidth)}px`);
+    els.imageCanvas.style.setProperty('--canvas-height', `${Math.round(placement.canvasHeight)}px`);
+    els.imageCanvas.style.setProperty('--image-left', `${Math.round(placement.left)}px`);
+    els.imageCanvas.style.setProperty('--image-top', `${Math.round(placement.top)}px`);
+    els.imageCanvas.style.setProperty('--image-width', `${Math.round(placement.width)}px`);
+    els.imageCanvas.style.setProperty('--image-height', `${Math.round(placement.height)}px`);
+    renderFocusOverlay();
     els.zoomResetButton.textContent = `${Math.round(state.zoom * 100)}%`;
 }
 
@@ -117,6 +133,191 @@ function setZoom(value) {
     applyZoom();
 }
 
+function scheduleAfterLayout(callback) {
+    window.requestAnimationFrame(() => {
+        callback();
+    });
+}
+
+function hasTableFocus(item = state.item) {
+    return item?.item_kind === 'table'
+        && Array.isArray(item.focus_bbox)
+        && item.focus_bbox.length === 4
+        && item.focus_bbox.every((value) => Number.isFinite(value));
+}
+
+function baseFittedImageSize() {
+    const width = fittedImageWidth();
+    const naturalWidth = els.rawImage.naturalWidth || 1;
+    const naturalHeight = els.rawImage.naturalHeight || Math.max(1, naturalWidth * 1.414);
+    return {
+        width,
+        height: width * (naturalHeight / naturalWidth),
+    };
+}
+
+function scaledImageSize() {
+    const baseSize = baseFittedImageSize();
+    return {
+        width: baseSize.width * state.zoom,
+        height: baseSize.height * state.zoom,
+    };
+}
+
+function imagePlacement() {
+    const { width, height } = scaledImageSize();
+    const stageWidth = Math.max(1, els.imageStage.clientWidth);
+    const stageHeight = Math.max(1, els.imageStage.clientHeight);
+    const paddedWidth = width + (IMAGE_STAGE_PADDING * 2);
+    const paddedHeight = height + (IMAGE_STAGE_PADDING * 2);
+    const canvasWidth = Math.max(stageWidth, paddedWidth);
+    const canvasHeight = Math.max(stageHeight, paddedHeight);
+
+    return {
+        width,
+        height,
+        canvasWidth,
+        canvasHeight,
+        left: IMAGE_STAGE_PADDING + Math.max(0, (canvasWidth - paddedWidth) / 2),
+        top: IMAGE_STAGE_PADDING + Math.max(0, (canvasHeight - paddedHeight) / 2),
+    };
+}
+
+function tableFocusPoint() {
+    if (!hasTableFocus()) return null;
+    const [left, top, right, bottom] = state.item.focus_bbox;
+    return {
+        x: ((left + right) / 2) / DET_COORD_MAX,
+        y: ((top + bottom) / 2) / DET_COORD_MAX,
+    };
+}
+
+function viewportCenterPoint() {
+    if (!els.rawImage.naturalWidth || !els.rawImage.naturalHeight) {
+        return { x: 0.5, y: 0.5 };
+    }
+    const placement = imagePlacement();
+    return {
+        x: (els.imageStage.scrollLeft + (els.imageStage.clientWidth / 2) - placement.left) / placement.width,
+        y: (els.imageStage.scrollTop + (els.imageStage.clientHeight / 2) - placement.top) / placement.height,
+    };
+}
+
+function centerViewportOnPoint(point) {
+    if (!point || !els.rawImage.naturalWidth || !els.rawImage.naturalHeight) return;
+    const placement = imagePlacement();
+    els.imageStage.scrollLeft = Math.max(
+        0,
+        placement.left + (point.x * placement.width) - (els.imageStage.clientWidth / 2),
+    );
+    els.imageStage.scrollTop = Math.max(
+        0,
+        placement.top + (point.y * placement.height) - (els.imageStage.clientHeight / 2),
+    );
+}
+
+function tableBoxes() {
+    if (Array.isArray(state.item?.focus_bboxes) && state.item.focus_bboxes.length > 0) {
+        return state.item.focus_bboxes;
+    }
+    if (Array.isArray(state.item?.focus_bbox) && state.item.focus_bbox.length === 4) {
+        return [state.item.focus_bbox];
+    }
+    return [];
+}
+
+function bboxToDisplayRect(bbox) {
+    const placement = imagePlacement();
+    return {
+        left: placement.left + (bbox[0] / DET_COORD_MAX) * placement.width,
+        top: placement.top + (bbox[1] / DET_COORD_MAX) * placement.height,
+        width: Math.max(1, ((bbox[2] - bbox[0]) / DET_COORD_MAX) * placement.width),
+        height: Math.max(1, ((bbox[3] - bbox[1]) / DET_COORD_MAX) * placement.height),
+    };
+}
+
+function clearFocusOverlay() {
+    els.imageOverlay.replaceChildren();
+    els.imageOverlay.hidden = true;
+}
+
+function renderFocusOverlay() {
+    if (!hasTableFocus() || !els.rawImage.naturalWidth || !els.rawImage.naturalHeight || els.rawImage.hidden) {
+        clearFocusOverlay();
+        return;
+    }
+
+    const boxes = tableBoxes().map((bbox) => {
+        const rect = bboxToDisplayRect(bbox);
+        const box = document.createElement('div');
+        box.className = 'focus-box';
+        box.style.left = `${rect.left}px`;
+        box.style.top = `${rect.top}px`;
+        box.style.width = `${rect.width}px`;
+        box.style.height = `${rect.height}px`;
+        return box;
+    });
+
+    els.imageOverlay.replaceChildren(...boxes);
+    els.imageOverlay.hidden = false;
+}
+
+function zoomAroundPoint(value, point) {
+    setZoom(value);
+    scheduleAfterLayout(() => centerViewportOnPoint(point));
+}
+
+function adjustZoom(delta) {
+    const nextZoom = state.zoom + delta;
+    const anchorPoint = tableFocusPoint() || viewportCenterPoint();
+    zoomAroundPoint(nextZoom, anchorPoint);
+}
+
+function clamp(value, min, max) {
+    return Math.min(max, Math.max(min, value));
+}
+
+function focusCurrentItem({ resetZoom = true } = {}) {
+    if (!hasTableFocus() || !els.rawImage.naturalWidth || !els.rawImage.naturalHeight) {
+        if (resetZoom) setZoom(1);
+        return;
+    }
+
+    const [left, top, right, bottom] = state.item.focus_bbox;
+    const boxWidthRatio = Math.max(1 / DET_COORD_MAX, (right - left) / DET_COORD_MAX);
+    const boxHeightRatio = Math.max(1 / DET_COORD_MAX, (bottom - top) / DET_COORD_MAX);
+    const baseSize = baseFittedImageSize();
+    const focusPoint = tableFocusPoint();
+
+    if (resetZoom) {
+        const availableWidth = Math.max(
+            180,
+            els.imageStage.clientWidth - FOCUS_VIEWPORT_MARGIN,
+        );
+        const availableHeight = Math.max(
+            180,
+            els.imageStage.clientHeight - FOCUS_VIEWPORT_MARGIN,
+        );
+        const paddedWidth = Math.max(
+            24,
+            boxWidthRatio * baseSize.width * FOCUS_BOX_OVERSCAN_X,
+        );
+        const paddedHeight = Math.max(
+            24,
+            boxHeightRatio * baseSize.height * FOCUS_BOX_OVERSCAN_Y,
+        );
+        const targetZoom = clamp(
+            Math.min(availableWidth / paddedWidth, availableHeight / paddedHeight),
+            0.35,
+            3,
+        );
+        zoomAroundPoint(targetZoom, focusPoint);
+        return;
+    }
+
+    scheduleAfterLayout(() => centerViewportOnPoint(focusPoint));
+}
+
 async function loadProgress() {
     const progress = await apiJson(`/api/session/${state.sessionId}`);
     updateProgress(progress);
@@ -147,7 +348,9 @@ async function loadItem(index) {
     els.reportName.textContent = data.item.report_name;
     els.industryValue.textContent = data.item.industry_slug;
     els.tickerValue.textContent = `${data.item.exchange}:${data.item.ticker} · ${data.item.year}`;
-    els.pageValue.textContent = `${data.item.page_number} / ${data.item.mmd_page_count}`;
+    els.pageValue.textContent = data.item.item_kind === 'table'
+        ? `${data.item.page_number} / ${data.item.mmd_page_count} · Table ${(data.item.table_index ?? 0) + 1}`
+        : `${data.item.page_number} / ${data.item.mmd_page_count}`;
     els.signalsValue.textContent = formatList(data.item.candidate_reasons);
     els.mappingValue.textContent = [data.item.mapping_status, ...data.item.mapping_warnings].filter(Boolean).join(' · ');
     els.imageSubtitle.textContent = data.item.raw_png_path || 'No raw image path';
@@ -156,6 +359,7 @@ async function loadItem(index) {
     els.markdownPreview.innerHTML = data.markdown_html || '';
     els.rawMarkdown.textContent = data.page_text || '';
     resetExtractedContentScroll();
+    clearFocusOverlay();
 
     if (data.item.raw_png_path) {
         els.rawImage.hidden = false;
@@ -166,10 +370,11 @@ async function loadItem(index) {
         els.rawImage.hidden = true;
         els.imageMissing.hidden = false;
         els.rawImage.removeAttribute('src');
+        setZoom(1);
+        clearFocusOverlay();
     }
 
     loadAnnotation(data.annotation);
-    setZoom(1);
     statusMessage(`Loaded item ${safeIndex + 1} of ${data.item_count}`);
     els.prevButton.disabled = safeIndex === 0;
     els.nextButton.disabled = safeIndex >= data.item_count - 1;
@@ -254,12 +459,17 @@ function setupEvents() {
         loadItem(state.index);
     });
     els.toggleRawButton.addEventListener('click', toggleRawMarkdown);
-    els.zoomOutButton.addEventListener('click', () => setZoom(state.zoom - 0.15));
-    els.zoomInButton.addEventListener('click', () => setZoom(state.zoom + 0.15));
-    els.zoomResetButton.addEventListener('click', () => setZoom(1));
+    els.zoomOutButton.addEventListener('click', () => adjustZoom(-0.15));
+    els.zoomInButton.addEventListener('click', () => adjustZoom(0.15));
+    els.zoomResetButton.addEventListener('click', () => focusCurrentItem({ resetZoom: true }));
+    els.refocusButton.addEventListener('click', () => focusCurrentItem({ resetZoom: true }));
     els.helpButton.addEventListener('click', () => els.helpDialog.showModal());
-    els.rawImage.addEventListener('load', () => setZoom(1));
-    window.addEventListener('resize', applyZoom);
+    els.rawImage.addEventListener('load', () => focusCurrentItem({ resetZoom: true }));
+    window.addEventListener('resize', () => {
+        const anchorPoint = tableFocusPoint() || viewportCenterPoint();
+        applyZoom();
+        scheduleAfterLayout(() => centerViewportOnPoint(anchorPoint));
+    });
     document.querySelectorAll('.status-button').forEach((button) => {
         button.addEventListener('click', () => quickMark(button.dataset.status, 'button'));
     });
@@ -286,13 +496,16 @@ function setupEvents() {
             go(-1);
         } else if (event.key === '+' || event.key === '=') {
             event.preventDefault();
-            setZoom(state.zoom + 0.15);
+            adjustZoom(0.15);
         } else if (event.key === '-') {
             event.preventDefault();
-            setZoom(state.zoom - 0.15);
+            adjustZoom(-0.15);
         } else if (event.key === '0') {
             event.preventDefault();
-            setZoom(1);
+            focusCurrentItem({ resetZoom: true });
+        } else if (event.key.toLowerCase() === 'f') {
+            event.preventDefault();
+            focusCurrentItem({ resetZoom: true });
         }
     });
 }
diff --git a/annotation_OCR/static/style.css b/annotation_OCR/static/style.css
index a5e8305..24f39fe 100644
--- a/annotation_OCR/static/style.css
+++ b/annotation_OCR/static/style.css
@@ -212,7 +212,6 @@ button:hover,
     position: relative;
     overflow: auto;
     display: block;
-    padding: 16px;
     background:
         linear-gradient(45deg, #dce3e4 25%, transparent 25%),
         linear-gradient(-45deg, #dce3e4 25%, transparent 25%),
@@ -224,16 +223,40 @@ button:hover,
 
 .image-canvas {
     --image-width: 320px;
-    display: flex;
-    justify-content: center;
-    align-items: flex-start;
-    min-width: max(100%, var(--image-width));
-    min-height: 100%;
+    --image-height: 453px;
+    --image-left: 16px;
+    --image-top: 16px;
+    --canvas-width: 100%;
+    --canvas-height: 100%;
+    position: relative;
+    width: var(--canvas-width);
+    min-width: var(--canvas-width);
+    height: var(--canvas-height);
+    min-height: var(--canvas-height);
+}
+
+.image-overlay {
+    position: absolute;
+    inset: 0;
+    pointer-events: none;
+}
+
+.focus-box {
+    position: absolute;
+    border: 1px solid rgba(204, 20, 20, 0.95);
+    border-radius: 4px;
+    background: rgba(204, 20, 20, 0.06);
+    box-shadow:
+        0 0 0 1px rgba(255, 255, 255, 0.8) inset,
+        0 0 0 1px rgba(204, 20, 20, 0.35);
 }
 
 #rawImage {
-    display: block;
+    position: absolute;
+    left: var(--image-left);
+    top: var(--image-top);
     width: var(--image-width);
+    height: var(--image-height);
     max-width: none;
     border: 1px solid #b7c3c5;
     background: white;
diff --git a/annotation_OCR/store.py b/annotation_OCR/store.py
index 32916b6..0d85a6b 100644
--- a/annotation_OCR/store.py
+++ b/annotation_OCR/store.py
@@ -23,6 +23,7 @@
     "session_name",
     "annotator",
     "item_id",
+    "item_kind",
     "industry_slug",
     "report_name",
     "exchange",
@@ -30,6 +31,9 @@
     "year",
     "page_index",
     "page_number",
+    "table_index",
+    "table_row_count",
+    "table_col_count",
     "overall_status",
     "notes",
     "updated_at_utc",
@@ -41,6 +45,8 @@
     "page_text_sha256",
     "raw_png_path",
     "mmd_path",
+    "det_mmd_path",
+    "focus_bbox",
 ]
 
 
@@ -226,6 +232,7 @@ def save_annotation(
         "created_at_utc": now,
         "updated_at_utc": now,
         "item_id": item_id,
+        "item_kind": item.get("item_kind", "page"),
         "industry_slug": item.get("industry_slug"),
         "report_name": item.get("report_name"),
         "exchange": item.get("exchange"),
@@ -233,8 +240,13 @@ def save_annotation(
         "year": item.get("year"),
         "page_index": item.get("page_index"),
         "page_number": item.get("page_number"),
+        "table_index": item.get("table_index"),
+        "table_row_count": item.get("table_row_count"),
+        "table_col_count": item.get("table_col_count"),
         "mmd_path": item.get("mmd_path"),
+        "det_mmd_path": item.get("det_mmd_path"),
         "raw_png_path": item.get("raw_png_path"),
+        "focus_bbox": item.get("focus_bbox"),
         "mapping_status": item.get("mapping_status"),
         "mapping_warnings": item.get("mapping_warnings", []),
         "candidate_reasons": item.get("candidate_reasons", []),
@@ -272,6 +284,7 @@ def summary_rows(session_id: str) -> list[dict[str, Any]]:
                 "session_name": metadata.get("session_name", ""),
                 "annotator": metadata.get("annotator", ""),
                 "item_id": item.get("item_id"),
+                "item_kind": item.get("item_kind", "page"),
                 "industry_slug": item.get("industry_slug"),
                 "report_name": item.get("report_name"),
                 "exchange": item.get("exchange"),
@@ -279,6 +292,9 @@ def summary_rows(session_id: str) -> list[dict[str, Any]]:
                 "year": item.get("year"),
                 "page_index": item.get("page_index"),
                 "page_number": item.get("page_number"),
+                "table_index": item.get("table_index"),
+                "table_row_count": item.get("table_row_count"),
+                "table_col_count": item.get("table_col_count"),
                 "overall_status": annotation.get("overall_status", "unreviewed"),
                 "notes": annotation.get("notes", ""),
                 "updated_at_utc": annotation.get("updated_at_utc", ""),
@@ -290,6 +306,8 @@ def summary_rows(session_id: str) -> list[dict[str, Any]]:
                 "page_text_sha256": item.get("page_text_sha256"),
                 "raw_png_path": item.get("raw_png_path"),
                 "mmd_path": item.get("mmd_path"),
+                "det_mmd_path": item.get("det_mmd_path"),
+                "focus_bbox": json.dumps(item.get("focus_bbox")),
             }
         )
     return rows
diff --git a/annotation_OCR/templates/index.html b/annotation_OCR/templates/index.html
index 4254140..bc13f29 100644
--- a/annotation_OCR/templates/index.html
+++ b/annotation_OCR/templates/index.html
@@ -47,11 +47,13 @@
                     <button id="zoomOutButton" type="button" title="Zoom out">-</button>
                     <button id="zoomResetButton" type="button" title="Reset zoom">100%</button>
                     <button id="zoomInButton" type="button" title="Zoom in">+</button>
+                    <button id="refocusButton" type="button" title="Center the detected table">Refocus</button>
                 </div>
             </div>
             <div class="image-stage" id="imageStage">
                 <div class="image-canvas" id="imageCanvas">
                     <img id="rawImage" alt="Raw OCR source page" decoding="async" fetchpriority="high">
+                    <div id="imageOverlay" class="image-overlay" aria-hidden="true" hidden></div>
                 </div>
                 <div id="imageMissing" class="missing-state" hidden>Raw image unavailable</div>
             </div>
@@ -143,6 +145,8 @@ <h2>Keyboard</h2>
                 <p>Next / previous</p>
                 <span>+ / - / 0</span>
                 <p>Zoom</p>
+                <span>F</span>
+                <p>Refocus on the table</p>
             </div>
         </form>
     </dialog>

From e5ac1b2e43db332a21ef631dcf80e34505387746 Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Thu, 28 May 2026 15:00:08 +0200
Subject: [PATCH 7/8] ENH: Make study cohorts with a definite number of annotators

---
 annotation_OCR/README.md           | 144 +++++-
 annotation_OCR/manifests/README.md |  46 ++
 annotation_OCR/study_agreement.py  | 787 +++++++++++++++++++++++++++++
 annotation_OCR/study_sessions.py   | 277 ++++++++++
 4 files changed, 1236 insertions(+), 18 deletions(-)
 create mode 100644 annotation_OCR/manifests/README.md
 create mode 100644 annotation_OCR/study_agreement.py
 create mode 100644 annotation_OCR/study_sessions.py

diff --git a/annotation_OCR/README.md b/annotation_OCR/README.md
index 11e9641..ebded4e 100644
--- a/annotation_OCR/README.md
+++ b/annotation_OCR/README.md
@@ -1,13 +1,22 @@
 # OCR Annotation Interface
 
-Browser interface for comparing raw OCR page images with the corresponding Markdown page extracted by DeepSeekOCR. The app stores page-level annotations under `annotation_OCR/sessions/` so quality labels can later be joined to LLM benchmark outputs.
+Browser interface for reviewing OCR table extraction quality. The app now
+defaults to table-level items extracted from `*_det.mmd`, shows the isolated
+HTML table in the extracted-content pane, and auto-centers the raw page image
+on the detected table region while still allowing manual zoom-out for more
+context.
+
+Annotations are stored under `annotation_OCR/sessions/` so quality labels can
+later be joined to downstream benchmark outputs.
 
 ## Run
 
 ### Headless mode (recommended for multi-user)
 
 Start the server with no session arguments — annotators create/resume sessions
-from the browser landing page:
+from the browser landing page. If `annotation_OCR/manifests/tables_5000.json`
+exists, the server uses it automatically for fast session creation. Otherwise
+it falls back to building a sampled table queue directly from the OCR corpus.
 
 ```bash
 uv run python annotation_OCR/server.py --host 0.0.0.0 --port 5050
@@ -25,7 +34,8 @@ From the repository root:
 uv run python annotation_OCR/server.py \
   --session-name "table QA smoke" \
   --annotator "your-name" \
-  --queue-mode table-candidates \
+  --queue-mode tables \
+  --sample-size 100 \
   --host 127.0.0.1 \
   --port 5050
 ```
@@ -36,13 +46,35 @@ For a small smoke run:
 uv run python annotation_OCR/server.py \
   --session-name smoke \
   --annotator test \
-  --queue-mode table-candidates \
+  --queue-mode tables \
+  --sample-size 20 \
   --limit-reports 2 \
-  --limit 20 \
   --host 127.0.0.1 \
   --port 5050
 ```
 
+To force the server to use an explicit precomputed manifest:
+
+```bash
+uv run python annotation_OCR/server.py \
+  --manifest-path annotation_OCR/manifests/tables_5000.json \
+  --host 127.0.0.1 \
+  --port 5050
+```
+
+To use precomputed study-session bundles for a paper annotation round:
+
+```bash
+uv run python annotation_OCR/server.py \
+  --study-bundle annotation_OCR/manifests/study_sessions_15.json \
+  --host 127.0.0.1 \
+  --port 5050
+```
+
+Each new session created from the landing page then receives the next fixed
+session queue from that bundle, so the progress bar tracks a real per-annotator
+target rather than the whole table pool.
+
 Resume an existing session:
 
 ```bash
@@ -57,7 +89,71 @@ ssh -L 5050:127.0.0.1:5050 USER@SERVER
 
 Then open `http://127.0.0.1:5050` locally.
 
-The extracted-content pane shows inline OCR images by default. Turn off `Inline images` if you want a lighter placeholder-only Markdown preview.
+For table sessions, the extracted-content pane shows only the isolated table and
+the raw-image pane auto-refocuses on the detected bounding box. Use `Refocus`
+or press `F` to jump back to the table after manual exploration.
+
+## Precompute A Reusable 5,000-Table Manifest
+
+Build the reusable subset once offline:
+
+```bash
+mkdir -p annotation_OCR/manifests
+
+uv run python annotation_OCR/ocr_index.py \
+  --queue-mode tables \
+  --sample-size 5000 \
+  --seed 42 \
+  --output annotation_OCR/manifests/tables_5000.json
+```
+
+That manifest can then be reused by the server so new annotation sessions do
+not need to rescan the OCR corpus.
+
+## Build Study Session Bundles
+
+For hybrid annotation rounds, build one bundle for each possible annotator
+count. Every generated bundle keeps each session's queue within the target
+range of 120 to 140 items:
+
+```bash
+uv run python annotation_OCR/study_sessions.py \
+  --source-manifest annotation_OCR/manifests/tables_5000.json \
+  --output-dir annotation_OCR/manifests \
+  --annotators 14 15 16 \
+  --seed 42
+```
+
+This writes:
+
+- `annotation_OCR/manifests/study_sessions_14.json`
+- `annotation_OCR/manifests/study_sessions_15.json`
+- `annotation_OCR/manifests/study_sessions_16.json`
+
+The 15- and 16-annotator bundles use 1500 unique tables, 300 of which are
+triple-coded agreement tables. The 14-annotator bundle lowers the agreement
+subset to 220 so all session quotas stay within the 120 to 140 target range.
+
+## Compute Agreement After Annotation
+
+After the study round, compute overlap agreement plus accept/reject ratios with:
+
+```bash
+uv run python annotation_OCR/study_agreement.py \
+  --study-bundle annotation_OCR/manifests/study_sessions_15.json
+```
+
+By default this writes analysis artifacts under:
+
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/summary.md`
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/summary.json`
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/session_metrics.csv`
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/item_metrics.csv`
+
+The script auto-discovers sessions created from that bundle via their stored
+`study_bundle_path` and `study_slot`. It reports exact agreement, pairwise
+agreement, Fleiss' kappa, and accept/reject ratios both at the raw vote level
+and at the final table-decision level.
 
 ## Data Sources
 
@@ -65,14 +161,20 @@ Defaults:
 
 - OCR Markdown root: `DeepSeekOCR_Ardian_pruned_1k/`
 - Raw image root: `/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs/`
+- Default reusable manifest path: `annotation_OCR/manifests/tables_5000.json`
 
-Each queued item maps one `.mmd` page split to the raw PNG with the same zero-based page index, for example page index `12` maps to `pages/page_0012.png`. The manifest records mapping warnings such as missing raw images or page-count mismatches.
+Each queued table item maps back to the raw PNG page with the same zero-based
+page index, for example page index `12` maps to `pages/page_0012.png`. Table
+items carry the `_det.mmd` bounding box used by the UI to center the preview.
+The manifest records mapping warnings such as missing raw images or page-count
+mismatches.
 
 ## Queue Modes
 
-- `table-candidates`: default. Keeps pages with table-like signals, dense numeric rows, financial statement headings, or KPI aliases.
-- `all`: queues every page.
-- `sample`: seeded random sample across all discovered pages. Use `--sample-size` and `--seed`.
+- `tables`: default. Queues table-level items from `*_det.mmd`. Use `--sample-size` together with `--seed` for reproducible random sampling.
+- `table-candidates`: legacy page-level mode. Keeps pages with table-like signals, dense numeric rows, financial statement headings, or KPI aliases.
+- `all`: legacy page-level mode that queues every page.
+- `sample`: legacy seeded random sample across all discovered pages.
 
 Indexer smoke check:
 
@@ -80,9 +182,9 @@ Indexer smoke check:
 uv run python annotation_OCR/ocr_index.py \
   --ocr-root DeepSeekOCR_Ardian_pruned_1k \
   --raw-root /data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs \
-  --queue-mode table-candidates \
+  --queue-mode tables \
+  --sample-size 20 \
   --limit-reports 2 \
-  --limit 20 \
   --check
 ```
 
@@ -93,7 +195,8 @@ uv run python annotation_OCR/ocr_index.py \
 - `u`: mark Uncertain, save, advance
 - `j` / right arrow: next page
 - `k` / left arrow: previous page
-- `+`, `-`, `0`: zoom controls
+- `+`, `-`, `0`: zoom / reset
+- `f`: refocus on the detected table
 - `?`: shortcut dialog
 
 Shortcuts are disabled while typing in notes or editing form controls.
@@ -103,10 +206,10 @@ Shortcuts are disabled while typing in notes or editing form controls.
 Each session writes to `annotation_OCR/sessions/{session_id}/`:
 
 - `metadata.json`: session name, annotator, configuration, counts, timestamps.
-- `manifest.json`: queued pages and mapping diagnostics.
+- `manifest.json`: queued items and mapping diagnostics.
 - `annotations.jsonl`: append-only event log, one saved annotation per line.
 - `current_annotations.json`: latest annotation per item, written atomically.
-- `summary.csv`: one row per queued page, including unreviewed pages.
+- `summary.csv`: one row per queued item, including unreviewed items.
 - `summary.md`: status-count overview.
 
 Regenerate summaries:
@@ -125,12 +228,15 @@ Primary fields:
 
 Identity fields include `industry_slug`, `report_name`, `exchange`, `ticker`, `year`, `page_index`, `page_number`, `mmd_path`, `raw_png_path`, and `page_text_sha256`.
 
+For table sessions, summary rows also include `item_kind`, `table_index`,
+`table_row_count`, `table_col_count`, `det_mmd_path`, and `focus_bbox`.
+
 ## Downstream Joins
 
-For page-level filtering, join annotation summaries on:
+For table-level filtering, join annotation summaries on:
 
 ```text
-exchange, ticker, year, page_index
+exchange, ticker, year, page_index, table_index
 ```
 
 For report-level benchmark filtering, aggregate page labels to:
@@ -139,4 +245,6 @@ For report-level benchmark filtering, aggregate page labels to:
 exchange, ticker, year
 ```
 
-A conservative report-level rule is to exclude a report when any reviewed table-candidate page is `not_ok`, or when the share of `uncertain` pages exceeds a threshold chosen for the benchmark run.
\ No newline at end of file
+A conservative report-level rule is to exclude a report when any reviewed table
+item is `not_ok`, or when the share of `uncertain` table items exceeds a
+threshold chosen for the benchmark run.
\ No newline at end of file
diff --git a/annotation_OCR/manifests/README.md b/annotation_OCR/manifests/README.md
new file mode 100644
index 0000000..cba2bc8
--- /dev/null
+++ b/annotation_OCR/manifests/README.md
@@ -0,0 +1,46 @@
+# Table Manifests
+
+Place reusable sampled table manifests here.
+
+Recommended default:
+
+```bash
+uv run python annotation_OCR/ocr_index.py \
+  --queue-mode tables \
+  --sample-size 5000 \
+  --seed 42 \
+  --output annotation_OCR/manifests/tables_5000.json
+```
+
+When `tables_5000.json` exists, `annotation_OCR/server.py` will use it by default for new sessions.
+
+## Study Session Bundles
+
+For paper annotation rounds, also build the headcount-specific session bundles:
+
+```bash
+uv run python annotation_OCR/study_sessions.py \
+  --source-manifest annotation_OCR/manifests/tables_5000.json \
+  --output-dir annotation_OCR/manifests \
+  --annotators 14 15 16 \
+  --seed 42
+```
+
+This creates:
+
+- `study_sessions_14.json`
+- `study_sessions_15.json`
+- `study_sessions_16.json`
+
+Use the bundle matching the final annotator count when starting the server:
+
+```bash
+uv run python annotation_OCR/server.py \
+  --study-bundle annotation_OCR/manifests/study_sessions_15.json
+```
+
+Why the 14-annotator bundle differs:
+
+- `1500 unique + 300 triple-coded` requires `2100` total annotations (`1200 × 1 + 300 × 3`).
+- That fits 15 or 16 annotators while keeping each session in the `120–140` range (`140` or about `131` items per session).
+- For 14 annotators, `2100` annotations would mean `150` per session, so the bundle uses `220` agreement tables instead, for `1940` total annotations and per-session targets of `138–139`.
diff --git a/annotation_OCR/study_agreement.py b/annotation_OCR/study_agreement.py
new file mode 100644
index 0000000..0e73762
--- /dev/null
+++ b/annotation_OCR/study_agreement.py
@@ -0,0 +1,787 @@
+"""Compute agreement and accept/reject ratios for bundle-backed table studies."""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from collections import Counter
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from math import comb
+from pathlib import Path
+from typing import Any
+
+
+HERE = Path(__file__).resolve().parent  # directory that contains this script
+DEFAULT_SESSIONS_DIR = HERE / "sessions"  # default value of --sessions-dir
+DEFAULT_ANALYSIS_ROOT = DEFAULT_SESSIONS_DIR / "study_analysis"  # parent of the default --output-dir
+REVIEWED_STATUSES = ("ok", "not_ok", "uncertain")  # statuses that count as a cast vote
+VALID_STATUSES = set(REVIEWED_STATUSES).union({"unreviewed"})  # statuses normalize_status() returns unchanged
+
+
+@dataclass(slots=True)
+class SessionPayload:  # one annotation session as assembled by load_session_payload()
+    session_id: str  # session id, which is also the session directory name
+    session_name: str  # metadata "session_name", falling back to session_id
+    annotator: str  # metadata "annotator", "" when absent
+    slot: int  # study slot parsed from metadata config "study_slot"
+    item_count: int  # metadata "item_count", falling back to the manifest length
+    completed_count: int  # metadata "completed_count", 0 when absent
+    updated_at_utc: str  # metadata "updated_at_utc", "" when absent
+    metadata: dict[str, Any]  # full content of metadata.json
+    manifest_items: list[dict[str, Any]]  # "items" list of manifest.json
+    current_annotations: dict[str, dict[str, Any]]  # content of current_annotations.json, looked up by item_id
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Return the command-line parser for the study agreement analysis."""
+    cli = argparse.ArgumentParser(
+        description="Compute agreement metrics and accept/reject ratios for OCR table study sessions."
+    )
+    register = cli.add_argument
+    register(
+        "--study-bundle",
+        required=True,
+        type=Path,
+        help="Path to the study_sessions_*.json bundle used for the annotation round.",
+    )
+    register(
+        "--sessions-dir",
+        default=DEFAULT_SESSIONS_DIR,
+        type=Path,
+        help="Directory containing annotation_OCR session folders.",
+    )
+    register(
+        "--output-dir",
+        default=None,
+        type=Path,
+        help="Output directory for summary artifacts. Defaults to sessions/study_analysis/<bundle-stem>/.",
+    )
+    register(
+        "--session-id",
+        nargs="+", dest="session_ids", default=None,
+        help="Optional explicit session ids to analyze. If omitted, all sessions linked to the study bundle are used.",
+    )
+    register(
+        "--strict-manifest",
+        action="store_true",
+        help="Fail if a selected session manifest does not match its bundle slot.",
+    )
+    return cli
+
+
+def utc_now() -> str:  # current UTC time as an ISO 8601 string with second precision
+    return datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
+
+
+def load_json(path: Path, *, default: Any | None = None) -> Any:
+    """Parse *path* as UTF-8 JSON, or return *default* when it is not a regular file."""
+    is_present = path.is_file()
+    return json.loads(path.read_text(encoding="utf-8")) if is_present else default
+
+
+def atomic_write_text(path: Path, text: str) -> None:  # stage in "<name>.tmp", then rename over path
+    path.parent.mkdir(parents=True, exist_ok=True)
+    staging = path.with_suffix(f"{path.suffix}.tmp")
+    staging.write_text(text, encoding="utf-8")
+    staging.replace(path)
+
+
+def atomic_write_json(path: Path, payload: Any) -> None:  # 2-space indented JSON, non-ASCII kept, trailing newline
+    atomic_write_text(path, f"{json.dumps(payload, indent=2, ensure_ascii=False)}\n")
+
+
+def safe_div(numerator: int, denominator: int) -> float | None:
+    """Return numerator / denominator, or None when the denominator is zero."""
+    # None rather than 0.0 keeps "undefined" distinguishable from a real zero ratio.
+    return None if denominator == 0 else numerator / denominator
+
+
+def format_ratio(value: float | None) -> str:
+    """Format *value* times 100 with one decimal and a percent sign; None becomes "n/a"."""
+    percent_text = "n/a" if value is None else f"{value * 100:.1f}%"
+    return percent_text
+
+
+def parse_int(value: Any) -> int | None:
+    """Return *value* if it is an int, int(value) for an all-decimal-digit string, else None."""
+    if isinstance(value, int):
+        return value
+    # isdecimal(), unlike isdigit(), rejects characters such as "²" that int() cannot parse.
+    return int(value) if isinstance(value, str) and value.isdecimal() else None
+
+
+def normalize_status(value: Any) -> str:
+    """Return *value* when it is a known status string, otherwise "unreviewed"."""
+    recognized = isinstance(value, str) and value in VALID_STATUSES
+    return value if recognized else "unreviewed"
+
+
+def load_study_bundle(path: Path) -> dict[str, Any]:
+    """Load the study bundle at *path*; raise ValueError if it is missing or malformed."""
+    payload = load_json(path)  # None when the file does not exist
+    # Type-check before .get(): a missing file or non-object JSON must raise ValueError, not AttributeError.
+    well_formed = isinstance(payload, dict) and payload.get("bundle_type") == "ocr_table_study_bundle"
+    if not well_formed or not isinstance(payload.get("sessions"), list):
+        raise ValueError(f"invalid study bundle in {path}")
+    return payload
+
+
+def build_bundle_index(  # index a bundle by slot and by item_id; returns (slot_index, item_index, warnings)
+    bundle: dict[str, Any],
+) -> tuple[dict[int, dict[str, Any]], dict[str, dict[str, Any]], list[str]]:
+    slot_index: dict[int, dict[str, Any]] = {}  # slot number -> bundle session entry
+    item_index: dict[str, dict[str, Any]] = {}  # item_id -> record merged over every slot listing the item
+    warnings: list[str] = []
+
+    for session in bundle["sessions"]:
+        slot = parse_int(session.get("slot"))
+        items = session.get("items")
+        if slot is None or not isinstance(items, list):
+            raise ValueError("invalid study session entry in bundle")
+        slot_index[slot] = session  # a later entry with the same slot replaces the earlier one
+        for item in items:
+            item_id = str(item.get("item_id") or "")
+            if not item_id:
+                raise ValueError(f"bundle slot {slot} contains an item without item_id")
+            expected_votes = parse_int(item.get("study_expected_votes")) or 1  # missing, unparsable or 0 -> 1
+            study_assignment = str(item.get("study_assignment") or "single")
+            record = item_index.setdefault(  # the first occurrence supplies the descriptive fields
+                item_id,
+                {
+                    "item_id": item_id,
+                    "industry_slug": item.get("industry_slug"),
+                    "report_name": item.get("report_name"),
+                    "exchange": item.get("exchange"),
+                    "ticker": item.get("ticker"),
+                    "year": item.get("year"),
+                    "page_index": item.get("page_index"),
+                    "page_number": item.get("page_number"),
+                    "table_index": item.get("table_index"),
+                    "table_row_count": item.get("table_row_count"),
+                    "table_col_count": item.get("table_col_count"),
+                    "study_assignment": study_assignment,
+                    "expected_votes": expected_votes,
+                    "assigned_slots": [],
+                },
+            )
+            record["assigned_slots"].append(slot)
+            record["expected_votes"] = max(record["expected_votes"], expected_votes)  # keep the largest declared count
+            if study_assignment == "agreement":
+                record["study_assignment"] = "agreement"
+
+    for item_id, record in item_index.items():
+        assigned_slots = sorted(record["assigned_slots"])
+        record["assigned_slots"] = assigned_slots
+        occurrence_count = len(assigned_slots)
+        if occurrence_count > 1:  # listed in several slots => treated as an agreement item
+            record["study_assignment"] = "agreement"
+        if occurrence_count != record["expected_votes"]:  # declared vote count disagrees with the actual assignment
+            warnings.append(
+                f"bundle item {item_id} appears in {occurrence_count} slots but declares study_expected_votes={record['expected_votes']}"
+            )
+
+    return slot_index, item_index, warnings
+
+
+def load_session_payload(sessions_dir: Path, session_id: str) -> SessionPayload:  # read one session directory
+    directory = sessions_dir / session_id
+    metadata = load_json(directory / "metadata.json")
+    if not isinstance(metadata, dict):  # file missing, or its top level is not a JSON object
+        raise FileNotFoundError(f"missing metadata for session {session_id}")
+    manifest = load_json(directory / "manifest.json", default={}) or {}  # a missing manifest is rejected below
+    manifest_items = manifest.get("items") if isinstance(manifest, dict) else None
+    if not isinstance(manifest_items, list):
+        raise ValueError(f"invalid manifest for session {session_id}")
+    current_annotations = (
+        load_json(directory / "current_annotations.json", default={}) or {}  # missing or empty -> no annotations
+    )
+    if not isinstance(current_annotations, dict):
+        raise ValueError(f"invalid current_annotations for session {session_id}")
+
+    config = metadata.get("config") or {}
+    slot = parse_int(config.get("study_slot"))
+    if slot is None:  # without a slot the session cannot be matched to the bundle
+        raise ValueError(f"session {session_id} has no usable study_slot")
+
+    return SessionPayload(
+        session_id=session_id,
+        session_name=str(metadata.get("session_name") or session_id),
+        annotator=str(metadata.get("annotator") or ""),
+        slot=slot,
+        item_count=parse_int(metadata.get("item_count")) or len(manifest_items),  # absent or 0 -> manifest length
+        completed_count=parse_int(metadata.get("completed_count")) or 0,
+        updated_at_utc=str(metadata.get("updated_at_utc") or ""),
+        metadata=metadata,
+        manifest_items=manifest_items,
+        current_annotations=current_annotations,
+    )
+
+
+def discover_sessions(  # choose one session per study slot; returns (sessions_by_slot, warnings)
+    *,
+    sessions_dir: Path,
+    bundle_path: Path,
+    session_ids: list[str] | None,
+) -> tuple[dict[int, SessionPayload], list[str]]:
+    bundle_resolved = str(bundle_path.resolve())  # compared verbatim with config "study_bundle_path"
+    warnings: list[str] = []
+    discovered: dict[int, SessionPayload] = {}
+
+    if session_ids is None:  # auto-discovery: consider every sub-directory of sessions_dir
+        candidate_ids = [
+            path.name for path in sorted(sessions_dir.iterdir()) if path.is_dir()
+        ]
+    else:
+        candidate_ids = session_ids
+
+    for session_id in candidate_ids:
+        metadata = load_json(sessions_dir / session_id / "metadata.json")
+        if not isinstance(metadata, dict):
+            if session_ids is None:  # auto-discovery skips directories without usable metadata
+                continue
+            raise FileNotFoundError(f"missing metadata for session {session_id}")
+
+        config = metadata.get("config") or {}
+        session_bundle = config.get("study_bundle_path")
+        session_slot = parse_int(config.get("study_slot"))
+        if session_ids is None:  # auto-discovery keeps only sessions linked to this bundle with a usable slot
+            if session_bundle != bundle_resolved or session_slot is None:
+                continue
+
+        payload = load_session_payload(sessions_dir, session_id)
+        if session_ids is not None and session_bundle not in {None, bundle_resolved}:  # explicit ids: warn, still use
+            warnings.append(
+                f"session {session_id} references a different study bundle: {session_bundle}"
+            )
+
+        existing = discovered.get(payload.slot)
+        if existing is None:
+            discovered[payload.slot] = payload
+            continue
+
+        keep = payload  # duplicate slot: keep the greater (updated_at_utc, session_id) pair, compared as strings
+        drop = existing
+        if (existing.updated_at_utc, existing.session_id) > (
+            payload.updated_at_utc,
+            payload.session_id,
+        ):
+            keep = existing
+            drop = payload
+        discovered[payload.slot] = keep
+        warnings.append(
+            f"multiple sessions claim study slot {payload.slot}; keeping {keep.session_id} and ignoring {drop.session_id}"
+        )
+
+    return discovered, warnings
+
+
+def validate_session_manifest(
+    *,
+    payload: SessionPayload,
+    expected_session: dict[str, Any],
+    strict: bool,
+) -> list[str]:
+    """Return a warning (or raise ValueError when *strict*) if the manifest ids differ from the bundle slot's."""
+    def id_counts(entries: list[dict[str, Any]]) -> Counter[str]:
+        return Counter(str(entry.get("item_id") or "") for entry in entries)
+
+    # Multiset comparison: item order is ignored, duplicates are not.
+    if id_counts(payload.manifest_items) == id_counts(expected_session["items"]):
+        return []
+    message = f"session {payload.session_id} manifest does not match bundle slot {payload.slot}"
+    if strict:
+        raise ValueError(message)
+    return [message]
+
+
+def status_ratio_block(counts: Counter[str]) -> dict[str, Any]:
+    """Return status counts plus their rates among reviewed and among decided votes."""
+    ok, not_ok, uncertain = (counts.get(key, 0) for key in ("ok", "not_ok", "uncertain"))
+    reviewed = sum(counts.get(status, 0) for status in REVIEWED_STATUSES)
+    decided = ok + not_ok  # "uncertain" counts as reviewed but not as decided
+    block: dict[str, Any] = {"reviewed": reviewed, "decided": decided, "ok": ok, "not_ok": not_ok, "uncertain": uncertain}
+    # Keyword order below is the key order of the resulting dict; each rate is None when its denominator is 0.
+    block.update(
+        ok_rate_all=safe_div(ok, reviewed),
+        not_ok_rate_all=safe_div(not_ok, reviewed),
+        uncertain_rate_all=safe_div(uncertain, reviewed),
+        accept_ratio_decided=safe_div(ok, decided),
+        reject_ratio_decided=safe_div(not_ok, decided),
+    )
+    return block
+
+
+def majority_status(counts: Counter[str], vote_count: int) -> str | None:
+    """Return the status that holds a strict majority of *vote_count* votes, else None."""
+    if vote_count == 0:
+        return None
+    best = max(counts.values(), default=0)
+    leaders = [status for status, tally in counts.items() if tally == best]
+    # A winner needs more than half of the votes and must be the only
+    # status at the top count; an exact half or a tie yields None.
+    has_winner = best * 2 > vote_count and len(leaders) == 1
+    return leaders[0] if has_winner else None
+
+
+def compute_pairwise_agreement(item_rows: list[dict[str, Any]]) -> dict[str, Any]:
+    """Measure pairwise agreement over the rows that hold at least two votes.
+
+    Returns the number of such rows, the total and matching vote pairs, and their ratio.
+    """
+    multi_vote_rows = [row for row in item_rows if int(row["vote_count"]) >= 2]
+    # A row with k votes has C(k, 2) pairs; C(c, 2) of them match for each status holding c votes.
+    pairs_total = sum(comb(int(row["vote_count"]), 2) for row in multi_vote_rows)
+    pairs_matching = sum(
+        comb(int(row[f"{status}_votes"]), 2)
+        for row in multi_vote_rows
+        for status in REVIEWED_STATUSES
+    )
+    return {
+        "items_considered": len(multi_vote_rows),
+        "pairs_total": pairs_total,
+        "pairs_matching": pairs_matching,
+        "agreement_rate": safe_div(pairs_matching, pairs_total),
+    }
+
+
+def compute_fleiss_kappa(item_rows: list[dict[str, Any]]) -> float | None:  # Fleiss' kappa, or None when undefined
+    if not item_rows:
+        return None
+    n = int(item_rows[0]["expected_votes"])  # ratings per item, taken from the first row
+    if n < 2:
+        return None
+    if any(int(row["expected_votes"]) != n for row in item_rows):  # every row must expect the same vote count
+        return None
+
+    total_items = len(item_rows)
+    p_i_values: list[float] = []  # per-item agreement values P_i
+    category_totals = Counter[str]()  # votes per status summed over all rows
+    for row in item_rows:
+        row_total = 0
+        squared_sum = 0
+        for status in REVIEWED_STATUSES:
+            count = int(row[f"{status}_votes"])
+            category_totals[status] += count
+            row_total += count
+            squared_sum += count * count
+        if row_total != n:  # a row with missing or extra votes makes the statistic undefined here
+            return None
+        p_i_values.append((squared_sum - n) / (n * (n - 1)))  # P_i = (sum of squared counts - n) / (n (n - 1))
+
+    p_bar = sum(p_i_values) / total_items  # mean observed agreement
+    p_e = 0.0  # chance agreement: sum of squared status proportions
+    for status in REVIEWED_STATUSES:
+        p_j = category_totals[status] / (total_items * n)
+        p_e += p_j * p_j
+    if p_e == 1.0:  # all votes share one status: the denominator below would be zero
+        return None
+    return (p_bar - p_e) / (1.0 - p_e)
+
+
+def build_analysis(  # run the whole analysis; returns (summary, session_rows, item_rows)
+    *,
+    bundle_path: Path,
+    sessions_dir: Path,
+    session_ids: list[str] | None,
+    strict_manifest: bool,
+) -> tuple[dict[str, Any], list[dict[str, Any]], list[dict[str, Any]]]:
+    bundle = load_study_bundle(bundle_path)
+    slot_index, item_index, warnings = build_bundle_index(bundle)
+    selected_sessions, session_warnings = discover_sessions(
+        sessions_dir=sessions_dir,
+        bundle_path=bundle_path,
+        session_ids=session_ids,
+    )
+    warnings.extend(session_warnings)
+
+    expected_slots = sorted(slot_index)
+    missing_slots = [slot for slot in expected_slots if slot not in selected_sessions]  # bundle slots with no session
+
+    session_rows: list[dict[str, Any]] = []  # one row per bundle slot that has a session
+    for slot in expected_slots:
+        payload = selected_sessions.get(slot)
+        if payload is None:
+            continue
+        warnings.extend(
+            validate_session_manifest(
+                payload=payload,
+                expected_session=slot_index[slot],
+                strict=strict_manifest,
+            )
+        )
+        slot_status_counts = Counter[str]()
+        for item in slot_index[slot]["items"]:  # statuses are counted over the bundle's item list for the slot
+            item_id = str(item["item_id"])
+            annotation = payload.current_annotations.get(item_id) or {}
+            slot_status_counts[normalize_status(annotation.get("overall_status"))] += 1
+        reviewed_count = (
+            len(slot_index[slot]["items"]) - slot_status_counts["unreviewed"]
+        )
+        if reviewed_count != payload.completed_count:  # metadata and annotations disagree: warn only
+            warnings.append(
+                f"session {payload.session_id} metadata says completed_count={payload.completed_count} but current_annotations implies {reviewed_count}"
+            )
+        status_block = status_ratio_block(slot_status_counts)
+        session_rows.append(
+            {
+                "slot": slot,
+                "session_id": payload.session_id,
+                "session_name": payload.session_name,
+                "annotator": payload.annotator,
+                "item_count": len(slot_index[slot]["items"]),
+                "reviewed_count": reviewed_count,
+                "unreviewed_count": slot_status_counts["unreviewed"],
+                "ok": slot_status_counts["ok"],
+                "not_ok": slot_status_counts["not_ok"],
+                "uncertain": slot_status_counts["uncertain"],
+                "accept_ratio_decided": status_block["accept_ratio_decided"],
+                "reject_ratio_decided": status_block["reject_ratio_decided"],
+                "uncertain_rate_all": status_block["uncertain_rate_all"],
+                "updated_at_utc": payload.updated_at_utc,
+            }
+        )
+
+    vote_level_counts_all = Counter[str]()  # every reviewed vote
+    vote_level_counts_single = Counter[str]()  # votes on items that are not agreement items
+    vote_level_counts_agreement = Counter[str]()  # votes on agreement items
+    item_rows: list[dict[str, Any]] = []  # one row per bundle item, in item_id order
+
+    for item_id, record in sorted(item_index.items()):
+        votes: list[dict[str, Any]] = []
+        missing_session_slots_for_item: list[int] = []  # assigned slots that have no session
+        unreviewed_slots: list[int] = []  # assigned slots whose session has no reviewed status for the item
+        available_slots: list[int] = []  # assigned slots that do have a session
+
+        for slot in record["assigned_slots"]:
+            payload = selected_sessions.get(slot)
+            if payload is None:
+                missing_session_slots_for_item.append(slot)
+                continue
+            available_slots.append(slot)
+            annotation = payload.current_annotations.get(item_id) or {}
+            status = normalize_status(annotation.get("overall_status"))
+            if status == "unreviewed":  # not a vote
+                unreviewed_slots.append(slot)
+                continue
+            vote = {
+                "slot": slot,
+                "session_id": payload.session_id,
+                "annotator": payload.annotator,
+                "overall_status": status,
+                "updated_at_utc": annotation.get("updated_at_utc", ""),
+            }
+            votes.append(vote)
+            vote_level_counts_all[status] += 1
+            if record["study_assignment"] == "agreement":
+                vote_level_counts_agreement[status] += 1
+            else:
+                vote_level_counts_single[status] += 1
+
+        vote_counts = Counter(vote["overall_status"] for vote in votes)
+        vote_count = len(votes)
+        majority = majority_status(vote_counts, vote_count)
+        is_complete = vote_count == int(record["expected_votes"])  # every expected vote is a reviewed vote
+        is_unanimous = is_complete and len(vote_counts) == 1
+        final_status = None
+        if record["study_assignment"] == "single" and vote_count == 1:  # single item: its one vote decides
+            final_status = votes[0]["overall_status"]
+        elif record["study_assignment"] == "agreement" and is_complete and majority:  # needs all votes and a majority
+            final_status = majority
+
+        item_rows.append(
+            {
+                "item_id": item_id,
+                "study_assignment": record["study_assignment"],
+                "expected_votes": int(record["expected_votes"]),
+                "assigned_slots": json.dumps(record["assigned_slots"]),
+                "available_slots": json.dumps(available_slots),
+                "missing_session_slots": json.dumps(missing_session_slots_for_item),
+                "unreviewed_slots": json.dumps(unreviewed_slots),
+                "vote_count": vote_count,
+                "ok_votes": vote_counts["ok"],
+                "not_ok_votes": vote_counts["not_ok"],
+                "uncertain_votes": vote_counts["uncertain"],
+                "is_complete": is_complete,
+                "is_unanimous": is_unanimous,
+                "has_majority": majority is not None,
+                "majority_status": majority or "",
+                "final_status": final_status or "",
+                "votes_json": json.dumps(votes, ensure_ascii=False),
+                "industry_slug": record.get("industry_slug"),
+                "report_name": record.get("report_name"),
+                "exchange": record.get("exchange"),
+                "ticker": record.get("ticker"),
+                "year": record.get("year"),
+                "page_index": record.get("page_index"),
+                "page_number": record.get("page_number"),
+                "table_index": record.get("table_index"),
+                "table_row_count": record.get("table_row_count"),
+                "table_col_count": record.get("table_col_count"),
+            }
+        )
+
+    agreement_rows = [
+        row for row in item_rows if row["study_assignment"] == "agreement"
+    ]
+    complete_agreement_rows = [row for row in agreement_rows if row["is_complete"]]
+    agreement_rows_with_2plus_votes = [
+        row for row in agreement_rows if int(row["vote_count"]) >= 2
+    ]
+    unanimous_rows = [row for row in complete_agreement_rows if row["is_unanimous"]]
+    majority_rows = [  # complete rows with a majority that is not unanimous
+        row
+        for row in complete_agreement_rows
+        if row["has_majority"] and not row["is_unanimous"]
+    ]
+    no_majority_rows = [
+        row for row in complete_agreement_rows if not row["has_majority"]
+    ]
+
+    final_status_counts = Counter(  # rows without a final status are left out
+        row["final_status"] for row in item_rows if row["final_status"]
+    )
+    agreement_final_counts = Counter(
+        row["majority_status"]
+        for row in complete_agreement_rows
+        if row["majority_status"]
+    )
+
+    summary = {
+        "analysis_completed_at_utc": utc_now(),
+        "study_bundle_path": str(bundle_path.resolve()),
+        "sessions_dir": str(sessions_dir.resolve()),
+        "bundle": {
+            "annotator_count": bundle.get("annotator_count"),
+            "required_votes": bundle.get("required_votes"),
+            "summary": bundle.get("summary") or {},
+        },
+        "session_coverage": {
+            "expected_slots": expected_slots,
+            "sessions_found": len(selected_sessions),
+            "missing_slots": missing_slots,
+        },
+        "annotation_votes": {
+            "all": status_ratio_block(vote_level_counts_all),
+            "single": status_ratio_block(vote_level_counts_single),
+            "agreement": status_ratio_block(vote_level_counts_agreement),
+        },
+        "agreement": {
+            "tables_total": len(agreement_rows),
+            "tables_with_any_vote": sum(
+                1 for row in agreement_rows if row["vote_count"] > 0
+            ),
+            "tables_with_2plus_votes": len(agreement_rows_with_2plus_votes),
+            "tables_complete": len(complete_agreement_rows),
+            "unanimous_tables": len(unanimous_rows),
+            "mixed_majority_tables": len(majority_rows),
+            "no_majority_tables": len(no_majority_rows),
+            "exact_agreement_rate": safe_div(
+                len(unanimous_rows), len(complete_agreement_rows)
+            ),
+            "complete_pairwise": compute_pairwise_agreement(complete_agreement_rows),
+            "partial_pairwise": compute_pairwise_agreement(
+                agreement_rows_with_2plus_votes
+            ),
+            "fleiss_kappa": compute_fleiss_kappa(complete_agreement_rows),
+            "majority_status_counts": dict(agreement_final_counts),
+            "majority_status_ratios": status_ratio_block(agreement_final_counts),
+        },
+        "final_table_decisions": {
+            "tables_with_final_status": sum(final_status_counts.values()),
+            "status_counts": dict(final_status_counts),
+            "status_ratios": status_ratio_block(final_status_counts),
+        },
+        "warnings": warnings,
+    }
+    return summary, session_rows, item_rows
+
+
+def write_csv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None:  # header + rows, staged in "<name>.tmp"
+    path.parent.mkdir(parents=True, exist_ok=True)
+    staging = path.with_suffix(f"{path.suffix}.tmp")
+    with staging.open("w", newline="", encoding="utf-8") as stream:
+        table = csv.DictWriter(stream, fieldnames=fieldnames, extrasaction="ignore")  # extra row keys are dropped
+        table.writeheader()
+        table.writerows(rows)
+    staging.replace(path)
+
+
+def render_summary_markdown(  # render the summary dict and session rows as a Markdown report
+    *, summary: dict[str, Any], session_rows: list[dict[str, Any]]
+) -> str:
+    session_coverage = summary["session_coverage"]
+    all_votes = summary["annotation_votes"]["all"]
+    agreement = summary["agreement"]
+    final_tables = summary["final_table_decisions"]
+    warnings = summary.get("warnings") or []
+
+    lines = [
+        "# OCR Table Study Agreement Summary",
+        "",
+        f"- Generated: {summary['analysis_completed_at_utc']}",
+        f"- Study bundle: {summary['study_bundle_path']}",
+        f"- Sessions directory: {summary['sessions_dir']}",
+        f"- Sessions found: {session_coverage['sessions_found']} / {len(session_coverage['expected_slots'])}",
+        f"- Missing slots: {', '.join(str(slot) for slot in session_coverage['missing_slots']) or 'none'}",
+        "",
+        "## Vote-Level Ratios",
+        "",
+        f"- Reviewed votes: {all_votes['reviewed']}",
+        f"- Accept rate among all reviewed votes: {format_ratio(all_votes['ok_rate_all'])}",
+        f"- Reject rate among all reviewed votes: {format_ratio(all_votes['not_ok_rate_all'])}",
+        f"- Uncertain rate among all reviewed votes: {format_ratio(all_votes['uncertain_rate_all'])}",
+        f"- Accept ratio among decided votes: {format_ratio(all_votes['accept_ratio_decided'])}",
+        f"- Reject ratio among decided votes: {format_ratio(all_votes['reject_ratio_decided'])}",
+        "",
+        "## Agreement Subset",
+        "",
+        f"- Agreement tables total: {agreement['tables_total']}",
+        f"- Agreement tables with 2+ votes: {agreement['tables_with_2plus_votes']}",
+        f"- Agreement tables complete: {agreement['tables_complete']}",
+        f"- Exact agreement rate: {format_ratio(agreement['exact_agreement_rate'])}",
+        f"- Complete pairwise agreement: {format_ratio(agreement['complete_pairwise']['agreement_rate'])}",
+        f"- Partial pairwise agreement: {format_ratio(agreement['partial_pairwise']['agreement_rate'])}",
+        f"- Fleiss' kappa: {agreement['fleiss_kappa']:.4f}"  # this and the next two lines form one list element
+        if agreement["fleiss_kappa"] is not None  # the f-string above is only evaluated when kappa is not None
+        else "- Fleiss' kappa: n/a",
+        f"- Unanimous tables: {agreement['unanimous_tables']}",
+        f"- Mixed-majority tables: {agreement['mixed_majority_tables']}",
+        f"- No-majority tables: {agreement['no_majority_tables']}",
+        "",
+        "## Final Table Decisions",
+        "",
+        f"- Tables with a final status: {final_tables['tables_with_final_status']}",
+        f"- Accept rate at table level: {format_ratio(final_tables['status_ratios']['ok_rate_all'])}",
+        f"- Reject rate at table level: {format_ratio(final_tables['status_ratios']['not_ok_rate_all'])}",
+        f"- Uncertain rate at table level: {format_ratio(final_tables['status_ratios']['uncertain_rate_all'])}",
+        f"- Accept ratio among decided tables: {format_ratio(final_tables['status_ratios']['accept_ratio_decided'])}",
+        f"- Reject ratio among decided tables: {format_ratio(final_tables['status_ratios']['reject_ratio_decided'])}",
+        "",
+        "## Session Breakdown",
+        "",
+        "| Slot | Session ID | Annotator | Reviewed | OK | Not OK | Uncertain | Accept Ratio | Reject Ratio |",
+        "| ---: | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |",
+    ]
+
+    for row in sorted(session_rows, key=lambda item: item["slot"]):  # one table row per session, by slot
+        lines.append(
+            "| "
+            + " | ".join(
+                [
+                    str(row["slot"]),
+                    row["session_id"],
+                    row["annotator"],  # inserted verbatim: a "|" in the value is not escaped
+                    str(row["reviewed_count"]),
+                    str(row["ok"]),
+                    str(row["not_ok"]),
+                    str(row["uncertain"]),
+                    format_ratio(row["accept_ratio_decided"]),
+                    format_ratio(row["reject_ratio_decided"]),
+                ]
+            )
+            + " |"
+        )
+
+    if warnings:  # the section is omitted entirely when there are no warnings
+        lines.extend(["", "## Warnings", ""])
+        for warning in warnings:
+            lines.append(f"- {warning}")
+
+    return "\n".join(lines) + "\n"
+
+
+def main(argv: list[str] | None = None) -> int:  # CLI entry: analyze, write four artifacts, print a digest, return 0
+    args = build_arg_parser().parse_args(argv)
+    output_dir = args.output_dir or (DEFAULT_ANALYSIS_ROOT / args.study_bundle.stem)  # default keyed by bundle stem
+
+    summary, session_rows, item_rows = build_analysis(
+        bundle_path=args.study_bundle,
+        sessions_dir=args.sessions_dir,
+        session_ids=args.session_ids,
+        strict_manifest=args.strict_manifest,
+    )
+
+    atomic_write_json(output_dir / "summary.json", summary)
+    atomic_write_text(
+        output_dir / "summary.md",
+        render_summary_markdown(summary=summary, session_rows=session_rows),
+    )
+    write_csv(
+        output_dir / "session_metrics.csv",
+        session_rows,
+        fieldnames=[  # column order of session_metrics.csv
+            "slot",
+            "session_id",
+            "session_name",
+            "annotator",
+            "item_count",
+            "reviewed_count",
+            "unreviewed_count",
+            "ok",
+            "not_ok",
+            "uncertain",
+            "accept_ratio_decided",
+            "reject_ratio_decided",
+            "uncertain_rate_all",
+            "updated_at_utc",
+        ],
+    )
+    write_csv(
+        output_dir / "item_metrics.csv",
+        item_rows,
+        fieldnames=[  # column order of item_metrics.csv
+            "item_id",
+            "study_assignment",
+            "expected_votes",
+            "assigned_slots",
+            "available_slots",
+            "missing_session_slots",
+            "unreviewed_slots",
+            "vote_count",
+            "ok_votes",
+            "not_ok_votes",
+            "uncertain_votes",
+            "is_complete",
+            "is_unanimous",
+            "has_majority",
+            "majority_status",
+            "final_status",
+            "votes_json",
+            "industry_slug",
+            "report_name",
+            "exchange",
+            "ticker",
+            "year",
+            "page_index",
+            "page_number",
+            "table_index",
+            "table_row_count",
+            "table_col_count",
+        ],
+    )
+
+    print(f"Wrote study analysis to {output_dir}")
+    print(  # short JSON digest of the headline numbers on stdout
+        json.dumps(
+            {
+                "sessions_found": summary["session_coverage"]["sessions_found"],
+                "agreement_tables_complete": summary["agreement"]["tables_complete"],
+                "exact_agreement_rate": summary["agreement"]["exact_agreement_rate"],
+                "final_table_accept_ratio": summary["final_table_decisions"][
+                    "status_ratios"
+                ]["accept_ratio_decided"],
+                "final_table_reject_ratio": summary["final_table_decisions"][
+                    "status_ratios"
+                ]["reject_ratio_decided"],
+                "warnings": len(summary.get("warnings") or []),  # only the count; the text is in the summary files
+            },
+            indent=2,
+        )
+    )
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/annotation_OCR/study_sessions.py b/annotation_OCR/study_sessions.py
new file mode 100644
index 0000000..731c5af
--- /dev/null
+++ b/annotation_OCR/study_sessions.py
@@ -0,0 +1,277 @@
+"""Build balanced table-study session bundles from a base table manifest."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+from pathlib import Path
+from typing import Any
+
+
+HERE = Path(__file__).resolve().parent  # directory containing this script
+DEFAULT_SOURCE_MANIFEST = HERE / "manifests" / "tables_5000.json"
+DEFAULT_OUTPUT_DIR = HERE / "manifests"
+
+DEFAULT_TOTAL_TABLES = 1200  # default for --total-tables
+DEFAULT_MIN_SESSION_ITEMS = 100  # default for --min-session-items
+DEFAULT_MAX_SESSION_ITEMS = 140  # default for --max-session-items
+DEFAULT_REQUIRED_VOTES = 3  # sessions assigned to each agreement table
+DEFAULT_OVERLAP_BY_ANNOTATORS = {  # annotator count -> agreement table count
+    13: 250,
+    14: 300,
+    15: 300,
+    16: 300,
+    17: 300,
+}
+
+
+def load_manifest_items(path: Path) -> list[dict[str, Any]]:
+    """Return the ``items`` list stored in the JSON manifest at ``path``."""
+    entries = json.loads(path.read_text(encoding="utf-8")).get("items")
+    if isinstance(entries, list):
+        return entries
+    raise ValueError(f"invalid manifest items in {path}")
+
+
+def balanced_counts(total: int, buckets: int) -> list[int]:
+    quotient, extra = divmod(total, buckets)  # first `extra` buckets get one more
+    return [quotient + 1] * extra + [quotient] * (buckets - extra)
+
+
+def pick_study_tables(
+    items: list[dict[str, Any]], *, total_tables: int, seed: int
+) -> list[dict[str, Any]]:
+    """Sample ``total_tables`` items with a seeded RNG and return them shuffled."""
+    if len(items) < total_tables:
+        detail = f"manifest with only {len(items)} items"
+        raise ValueError(f"requested {total_tables} tables from {detail}")
+    sampler = random.Random(seed)
+    chosen = sampler.sample(items, total_tables)
+    sampler.shuffle(chosen)
+    return chosen
+
+
+def choose_overlap_sessions(
+    *, overlap_items: list[dict[str, Any]], overlap_counts: list[int], seed: int
+) -> list[list[dict[str, Any]]]:
+    """Assign each overlap item to the DEFAULT_REQUIRED_VOTES sessions with most room.
+
+    Ties are broken by a ``seed``-driven shuffle; ValueError on a capacity mismatch.
+    """
+    shuffler = random.Random(seed)
+    capacity = list(overlap_counts)
+    per_session: list[list[dict[str, Any]]] = [[] for _ in overlap_counts]
+    for table in overlap_items:
+        open_slots = [slot for slot, room in enumerate(capacity) if room > 0]
+        if len(open_slots) < DEFAULT_REQUIRED_VOTES:
+            raise ValueError(
+                "not enough session capacity left for agreement assignment"
+            )
+        shuffler.shuffle(open_slots)  # tie-break; the sort below is stable
+        ranked = sorted(open_slots, key=lambda slot: -capacity[slot])
+        for slot in ranked[:DEFAULT_REQUIRED_VOTES]:
+            per_session[slot].append(table)
+            capacity[slot] -= 1
+    if any(capacity):
+        raise ValueError("failed to exhaust overlap assignment capacities")
+    return per_session
+
+
+def build_session_items(
+    *,
+    selected_items: list[dict[str, Any]],
+    annotator_count: int,
+    overlap_tables: int,
+    seed: int,
+    min_session_items: int,
+    max_session_items: int,
+) -> dict[str, Any]:
+    """Distribute the selected tables over ``annotator_count`` balanced sessions.
+
+    The first ``overlap_tables`` items become "agreement" records shared by
+    DEFAULT_REQUIRED_VOTES sessions; the remaining items become "single"
+    records that each land in exactly one session.
+
+    Raises:
+        ValueError: if ``overlap_tables`` exceeds the number of selected items, if
+            a session size falls outside ``[min_session_items, max_session_items]``,
+            or if the single-record counts do not add up to the remaining items.
+    """
+    def tag(item: dict[str, Any], kind: str, votes: int, slot: int) -> dict[str, Any]:
+        # Copy the manifest item and append the study bookkeeping fields.
+        return {
+            **dict(item),
+            "study_assignment": kind,
+            "study_expected_votes": votes,
+            "study_session_slot": slot,
+        }
+
+    total_tables = len(selected_items)
+    if total_tables < overlap_tables:
+        raise ValueError("overlap table count cannot exceed selected tables")
+
+    total_annotations = total_tables + (DEFAULT_REQUIRED_VOTES - 1) * overlap_tables
+    session_sizes = balanced_counts(total_annotations, annotator_count)
+    if not all(min_session_items <= n <= max_session_items for n in session_sizes):
+        raise ValueError(
+            f"cannot distribute {total_annotations} annotations across {annotator_count} sessions "
+            f"inside [{min_session_items}, {max_session_items}]"
+        )
+
+    overlap_assignments = choose_overlap_sessions(
+        overlap_items=selected_items[:overlap_tables],
+        overlap_counts=balanced_counts(
+            overlap_tables * DEFAULT_REQUIRED_VOTES, annotator_count
+        ),
+        seed=seed + annotator_count,
+    )
+    single_counts = [
+        size - len(shared) for size, shared in zip(session_sizes, overlap_assignments)
+    ]
+    single_pool = selected_items[overlap_tables:]
+    if sum(single_counts) != len(single_pool):
+        raise ValueError("unique assignment counts do not match remaining tables")
+
+    # One RNG drives the pool shuffle and then every per-session shuffle, in order.
+    rng = random.Random(seed + 1000 + annotator_count)
+    rng.shuffle(single_pool)
+
+    sessions: list[dict[str, Any]] = []
+    cursor = 0
+    for slot, shared in enumerate(overlap_assignments, start=1):
+        stop = cursor + single_counts[slot - 1]
+        agreement_records = [
+            tag(item, "agreement", DEFAULT_REQUIRED_VOTES, slot) for item in shared
+        ]
+        unique_records = [
+            tag(item, "single", 1, slot) for item in single_pool[cursor:stop]
+        ]
+        cursor = stop
+        manifest_items = agreement_records + unique_records
+        rng.shuffle(manifest_items)
+        sessions.append(
+            {
+                "slot": slot,
+                "target_items": len(manifest_items),
+                "agreement_items": len(agreement_records),
+                "single_items": len(unique_records),
+                "items": manifest_items,
+            }
+        )
+
+    return {
+        "annotator_count": annotator_count,
+        "session_item_counts": [session["target_items"] for session in sessions],
+        "overlap_tables": overlap_tables,
+        "unique_tables": total_tables,
+        "total_annotations": total_annotations,
+        "sessions": sessions,
+    }
+
+
+def build_study_bundle(
+    *,
+    source_manifest_path: Path,
+    annotator_count: int,
+    overlap_tables: int,
+    total_tables: int,
+    seed: int,
+    min_session_items: int,
+    max_session_items: int,
+) -> dict[str, Any]:
+    """Load the manifest, sample the study tables and assemble one bundle payload."""
+    pool = load_manifest_items(source_manifest_path)
+    plan = build_session_items(
+        selected_items=pick_study_tables(pool, total_tables=total_tables, seed=seed),
+        annotator_count=annotator_count,
+        overlap_tables=overlap_tables,
+        seed=seed,
+        min_session_items=min_session_items,
+        max_session_items=max_session_items,
+    )
+    return {
+        "bundle_type": "ocr_table_study_bundle",
+        "source_manifest_path": str(source_manifest_path),
+        "seed": seed,
+        "annotator_count": annotator_count,
+        "required_votes": DEFAULT_REQUIRED_VOTES,
+        "min_session_items": min_session_items,
+        "max_session_items": max_session_items,
+        "summary": {
+            "annotator_count": annotator_count,
+            "unique_tables": plan["unique_tables"],
+            "agreement_tables": plan["overlap_tables"],
+            "total_annotations": plan["total_annotations"],
+            "session_item_counts": plan["session_item_counts"],
+        },
+        "sessions": plan["sessions"],
+    }
+
+
+def write_bundle(path: Path, payload: dict[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    staging = path.with_suffix(f"{path.suffix}.tmp")  # x.json -> x.json.tmp
+    staging.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    staging.replace(path)  # rename the finished file over the target
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the session-bundle builder."""
+    parser = argparse.ArgumentParser(
+        description="Build OCR table-study session bundles."
+    )
+    for flag, caster, fallback in (
+        ("--source-manifest", Path, DEFAULT_SOURCE_MANIFEST),
+        ("--output-dir", Path, DEFAULT_OUTPUT_DIR),
+        ("--seed", int, 42),
+        ("--total-tables", int, DEFAULT_TOTAL_TABLES),
+        ("--min-session-items", int, DEFAULT_MIN_SESSION_ITEMS),
+        ("--max-session-items", int, DEFAULT_MAX_SESSION_ITEMS),
+    ):
+        parser.add_argument(flag, type=caster, default=fallback)
+    parser.add_argument(
+        "--annotators",
+        type=int,
+        nargs="+",
+        default=sorted(DEFAULT_OVERLAP_BY_ANNOTATORS),
+        help="Annotator counts to build bundles for, e.g. --annotators 14 15 16",
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Build and write one study bundle per requested annotator count.
+
+    Prints a JSON summary per bundle; raises ValueError for a count without a preset.
+    """
+    args = build_arg_parser().parse_args(argv)
+    for annotator_count in args.annotators:
+        overlap_tables = DEFAULT_OVERLAP_BY_ANNOTATORS.get(annotator_count)
+        if overlap_tables is None:
+            raise ValueError(
+                f"no default overlap setting for annotator count {annotator_count}"
+            )
+        bundle = build_study_bundle(
+            source_manifest_path=args.source_manifest,
+            annotator_count=annotator_count,
+            overlap_tables=overlap_tables,
+            total_tables=args.total_tables,
+            seed=args.seed,
+            min_session_items=args.min_session_items,
+            max_session_items=args.max_session_items,
+        )
+        output_path = args.output_dir / f"study_sessions_{annotator_count}.json"
+        write_bundle(output_path, bundle)
+        report = {
+            "annotator_count": annotator_count,
+            "overlap_tables": overlap_tables,
+            "output": str(output_path),
+            **bundle["summary"],
+        }
+        print(json.dumps(report, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status

From 48a4623eb67eed62168612757f098277209d4d36 Mon Sep 17 00:00:00 2001
From: Charles Moslonka <charles.moslonka@artefact.com>
Date: Thu, 28 May 2026 15:00:29 +0200
Subject: [PATCH 8/8] ENH: fix the dollar sign in DeepSeekOCR

---
 scripts/fix_broken_dollar_overlap.py | 216 +++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 scripts/fix_broken_dollar_overlap.py

diff --git a/scripts/fix_broken_dollar_overlap.py b/scripts/fix_broken_dollar_overlap.py
new file mode 100644
index 0000000..f3ae8b4
--- /dev/null
+++ b/scripts/fix_broken_dollar_overlap.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+r"""Replace broken dollar markers in .mmd files using heuristic-based selection.
+
+Heuristic A (pair-based):
+- Adjacent marker pair "\(" then "\)" with no curly braces between them.
+
+Heuristic B (money-context):
+- Marker appears to precede an amount-like token or nearby money phrasing.
+- Excludes obvious math-like markup such as "\( _{2}" and "\( ^{TM}".
+
+Selection strategies:
+- money: use only money-context markers (higher recall; default).
+- overlap: use intersection of pair-based and money-context markers (higher precision).
+
+Always-on exact rule:
+- Replace exact table cell markers "<td>\(</td>" and "<td>\)</td>".
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+from pathlib import Path
+from typing import Iterable
+
+MARKER_RE = re.compile(r"\\\(|\\\)")  # a literal "\(" or "\)" token
+MONEY_NUM_RE = re.compile(r"^\s*[\(\-]?\d(?:[\d,]*\.?\d*)")  # optional "(" or "-", then a digit
+MONEY_WORD_RE = re.compile(r"^.{0,24}\b(?:million|billion|thousand)\b", re.IGNORECASE)
+MONEY_PHRASE_RE = re.compile(
+    r"^.{0,30}\b(?:per\s+share|per\s+ton|per\s+gallon|per\s+bushel|market\s+value)\b",
+    re.IGNORECASE,
+)
+MATHISH_RE = re.compile(r"^\s*[_\^]?\s*\{")  # optional "_" or "^", then "{"
+EXACT_TD_RE = re.compile(r"<td>(\\\(|\\\))</td>")  # a cell holding only a marker
+
+
+def iter_mmd_files(root: Path) -> Iterable[Path]:
+    """Yield each path under ``root`` that matches ``*.mmd`` and is a file."""
+    candidates = root.rglob("*.mmd")
+    yield from (entry for entry in candidates if entry.is_file())
+
+
+def get_markers(text: str) -> list[tuple[int, str]]:
+    return [(hit.start(), hit[0]) for hit in MARKER_RE.finditer(text)]  # (pos, token)
+
+
+def select_user_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
+    r"""Return offsets of adjacent ``\(`` then ``\)`` markers enclosing no curly brace.
+
+    Pairs are consecutive entries of ``markers`` (``(offset, token)`` tuples); both
+    offsets of each qualifying pair are included in the result.
+    """
+    picked: set[int] = set()
+    for (start, opener), (end, closer) in zip(markers, markers[1:]):
+        if (opener, closer) == (r"\(", r"\)"):
+            inner = text[start + 2 : end]
+            if not ("{" in inner or "}" in inner):
+                picked.update((start, end))
+    return picked
+
+
+def select_money_context_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
+    """Return offsets of markers whose following text looks like a money amount.
+
+    Up to 64 characters after each 2-character marker are inspected. A marker is
+    kept when MONEY_NUM_RE, MONEY_WORD_RE or MONEY_PHRASE_RE matches there,
+    unless MATHISH_RE matches first.
+    """
+    money_patterns = (MONEY_NUM_RE, MONEY_WORD_RE, MONEY_PHRASE_RE)
+    picked: set[int] = set()
+    for offset, _token in markers:
+        lookahead = text[offset + 2 : offset + 66]
+        # Skip obvious math-like constructions: \( _{...}, \( ^{...}, \({ ...
+        if MATHISH_RE.match(lookahead):
+            continue
+        if any(pattern.match(lookahead) for pattern in money_patterns):
+            picked.add(offset)
+    return picked
+
+
+def select_exact_td_markers(text: str) -> set[int]:
+    r"""Return offsets of markers that are a whole cell, e.g. ``<td>\(</td>``."""
+    return {cell.span(1)[0] for cell in EXACT_TD_RE.finditer(text)}
+
+
+def apply_replacements(
+    text: str, markers: list[tuple[int, str]], positions: set[int]
+) -> tuple[str, int]:
+    """Replace every selected 2-character marker with ``$``.
+
+    Returns the rewritten text and the number of replacements; offsets in
+    ``positions`` that match no entry of ``markers`` are ignored.
+    """
+    if not positions:
+        return text, 0
+
+    hits = [offset for offset, _token in markers if offset in positions]
+    pieces: list[str] = []
+    resume = 0
+    for offset in hits:
+        pieces += (text[resume:offset], "$")
+        resume = offset + 2
+    pieces.append(text[resume:])
+    return "".join(pieces), len(hits)
+
+
+def process_file(path: Path, dry_run: bool, strategy: str) -> dict[str, int]:
+    """Replace the selected markers in one file and return its counters.
+
+    The file is read and written with ``newline=""`` so existing line endings
+    (e.g. CRLF) are preserved. Nothing is written when ``dry_run`` is set or no
+    marker was replaced. Raises ValueError for an unknown ``strategy``.
+    """
+    with path.open(encoding="utf-8", newline="") as handle:
+        text = handle.read()
+    markers = get_markers(text)
+    user_positions = select_user_markers(text, markers)
+    money_positions = select_money_context_markers(text, markers)
+    overlap = user_positions & money_positions
+    td_exact_positions = select_exact_td_markers(text)
+    by_strategy = {"money": money_positions, "overlap": overlap}
+    if strategy not in by_strategy:
+        raise ValueError(f"Unknown strategy: {strategy}")
+    selected_positions = by_strategy[strategy] | td_exact_positions
+    updated_text, replaced = apply_replacements(text, markers, selected_positions)
+    if replaced > 0 and not dry_run:
+        with path.open("w", encoding="utf-8", newline="") as handle:
+            handle.write(updated_text)
+    return {
+        "markers": len(markers),
+        "user": len(user_positions),
+        "money": len(money_positions),
+        "overlap": len(overlap),
+        "td_exact": len(td_exact_positions),
+        "replaced": replaced,
+        "changed": int(replaced > 0),
+    }
+
+
+def main() -> int:
+    """Run the command-line tool over a directory tree and print summary totals.
+
+    Every ``.mmd`` file below the given directory is passed to ``process_file``;
+    its counters are summed and printed as ``KEY=value`` lines. Raises SystemExit
+    with a message when the directory argument is not an existing directory.
+
+    Returns:
+        0 once the summary has been printed.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Replace broken dollar markers in .mmd files using heuristic-based "
+            "selection."
+        )
+    )
+    parser.add_argument(
+        "directory", type=Path, help="Root directory to scan recursively"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Compute and report changes without writing files",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Print per-file replacement counts",
+    )
+    parser.add_argument(
+        "--strategy",
+        choices=("money", "overlap"),
+        default="money",
+        help=(
+            "Replacement selection strategy: 'money' (higher recall, default) "
+            "or 'overlap' (higher precision)."
+        ),
+    )
+    options = parser.parse_args()
+
+    root = options.directory
+    if not root.is_dir():
+        raise SystemExit(f"Directory not found or not a directory: {root}")
+
+    # Per-file counters returned by process_file; each is summed across all files.
+    stat_keys = (
+        "markers", "user", "money", "overlap", "td_exact", "replaced", "changed"
+    )
+    totals = dict.fromkeys(("files", *stat_keys), 0)
+    for path in iter_mmd_files(root):
+        stats = process_file(path, dry_run=options.dry_run, strategy=options.strategy)
+        totals["files"] += 1
+        for key in stat_keys:
+            totals[key] += stats[key]
+        if options.verbose and stats["replaced"] > 0:
+            print(f"{path}: replacements={stats['replaced']}")
+
+    summary = (
+        ("MODE", "DRY RUN" if options.dry_run else "APPLY"),
+        ("STRATEGY", options.strategy),
+        ("FILES_SCANNED", totals["files"]),
+        ("TOTAL_MARKER_TOKENS", totals["markers"]),
+        ("USER_HEURISTIC_TOTAL", totals["user"]),
+        ("MONEY_HEURISTIC_TOTAL", totals["money"]),
+        ("OVERLAP_TOTAL", totals["overlap"]),
+        ("EXACT_TD_TOTAL", totals["td_exact"]),
+        ("REPLACEMENTS", totals["replaced"]),
+        ("FILES_CHANGED", totals["changed"]),
+    )
+    for label, value in summary:
+        print(f"{label}={value}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status