diff --git a/.gitignore b/.gitignore
index 894a2bc..4db1aa8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,9 @@ doc_text_processing/CEO_word_extraction/cleaning_extractions/cleaned/
KPI_analysis/cache/
KPI_analysis/output/
+# OCR annotation artifacts
+annotation_OCR/sessions/
+
# VSCode settings
.vscode/settings.json
diff --git a/annotation_OCR/README.md b/annotation_OCR/README.md
new file mode 100644
index 0000000..ebded4e
--- /dev/null
+++ b/annotation_OCR/README.md
@@ -0,0 +1,250 @@
+# OCR Annotation Interface
+
+Browser interface for reviewing OCR table extraction quality. The app now
+defaults to table-level items extracted from `*_det.mmd`, shows the isolated
+HTML table in the extracted-content pane, and auto-centers the raw page image
+on the detected table region while still allowing manual zoom-out for more
+context.
+
+Annotations are stored under `annotation_OCR/sessions/` so quality labels can
+later be joined to downstream benchmark outputs.
+
+## Run
+
+### Headless mode (recommended for multi-user)
+
+Start the server with no session arguments — annotators create/resume sessions
+from the browser landing page. If `annotation_OCR/manifests/tables_5000.json`
+exists, the server uses it automatically for fast session creation. Otherwise
+it falls back to building a sampled table queue directly from the OCR corpus.
+
+```bash
+uv run python annotation_OCR/server.py --host 0.0.0.0 --port 5050
+```
+
+Then open `http://HOST:5050`. The landing page lets each user enter their name,
+create a new session, or resume an existing one. No CLI or Python knowledge
+needed on the annotator side.
+
+### Pre-created session (single-user / scripted)
+
+From the repository root:
+
+```bash
+uv run python annotation_OCR/server.py \
+ --session-name "table QA smoke" \
+ --annotator "your-name" \
+ --queue-mode tables \
+ --sample-size 100 \
+ --host 127.0.0.1 \
+ --port 5050
+```
+
+For a small smoke run:
+
+```bash
+uv run python annotation_OCR/server.py \
+ --session-name smoke \
+ --annotator test \
+ --queue-mode tables \
+ --sample-size 20 \
+ --limit-reports 2 \
+ --host 127.0.0.1 \
+ --port 5050
+```
+
+To force the server to use an explicit precomputed manifest:
+
+```bash
+uv run python annotation_OCR/server.py \
+ --manifest-path annotation_OCR/manifests/tables_5000.json \
+ --host 127.0.0.1 \
+ --port 5050
+```
+
+To use precomputed study-session bundles for a paper annotation round:
+
+```bash
+uv run python annotation_OCR/server.py \
+ --study-bundle annotation_OCR/manifests/study_sessions_15.json \
+ --host 127.0.0.1 \
+ --port 5050
+```
+
+Each new session created from the landing page then receives the next fixed
+session queue from that bundle, so the progress bar tracks a real per-annotator
+target rather than the whole table pool.
+
+Resume an existing session:
+
+```bash
+uv run python annotation_OCR/server.py --session-id SESSION_ID --host 127.0.0.1 --port 5050
+```
+
+SSH port forwarding from a laptop:
+
+```bash
+ssh -L 5050:127.0.0.1:5050 USER@SERVER
+```
+
+Then open `http://127.0.0.1:5050` locally.
+
+For table sessions, the extracted-content pane shows only the isolated table and
+the raw-image pane auto-refocuses on the detected bounding box. Use `Refocus`
+or press `F` to jump back to the table after manual exploration.
+
+## Precompute A Reusable 5,000-Table Manifest
+
+Build the reusable subset once offline:
+
+```bash
+mkdir -p annotation_OCR/manifests
+
+uv run python annotation_OCR/ocr_index.py \
+ --queue-mode tables \
+ --sample-size 5000 \
+ --seed 42 \
+ --output annotation_OCR/manifests/tables_5000.json
+```
+
+That manifest can then be reused by the server so new annotation sessions do
+not need to rescan the OCR corpus.
+
+## Build Study Session Bundles
+
+For hybrid annotation rounds, build one bundle for each possible annotator
+count. The generated bundles already keep each session inside the target range
+of 120 to 140 items:
+
+```bash
+uv run python annotation_OCR/study_sessions.py \
+ --source-manifest annotation_OCR/manifests/tables_5000.json \
+ --output-dir annotation_OCR/manifests \
+ --annotators 14 15 16 \
+ --seed 42
+```
+
+This writes:
+
+- `annotation_OCR/manifests/study_sessions_14.json`
+- `annotation_OCR/manifests/study_sessions_15.json`
+- `annotation_OCR/manifests/study_sessions_16.json`
+
+The 15- and 16-annotator bundles use 1500 unique tables, 300 of which are
+triple-coded agreement tables (1200 + 3 × 300 = 2100 annotations in total). The
+14-annotator bundle lowers the agreement subset to 220 tables so that all
+session quotas still stay within the 120 to 140 target range.
+
+## Compute Agreement After Annotation
+
+After the study round, compute overlap agreement plus accept/reject ratios with:
+
+```bash
+uv run python annotation_OCR/study_agreement.py \
+ --study-bundle annotation_OCR/manifests/study_sessions_15.json
+```
+
+By default this writes analysis artifacts under:
+
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/summary.md`
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/summary.json`
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/session_metrics.csv`
+- `annotation_OCR/sessions/study_analysis/study_sessions_15/item_metrics.csv`
+
+The script auto-discovers sessions created from that bundle via their stored
+`study_bundle_path` and `study_slot`. It reports exact agreement, pairwise
+agreement, Fleiss' kappa, and accept/reject ratios both at the raw vote level
+and at the final table-decision level.
+
+## Data Sources
+
+Defaults:
+
+- OCR Markdown root: `DeepSeekOCR_Ardian_pruned_1k/`
+- Raw image root: `/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs/`
+- Default reusable manifest path: `annotation_OCR/manifests/tables_5000.json`
+
+Each queued table item maps back to the raw PNG page with the same zero-based
+page index, for example page index `12` maps to `pages/page_0012.png`. Table
+items carry the `_det.mmd` bounding box used by the UI to center the preview.
+The manifest records mapping warnings such as missing raw images or page-count
+mismatches.
+
+## Queue Modes
+
+- `tables`: default. Queues table-level items from `*_det.mmd`. Use `--sample-size` for deterministic random sampling.
+- `table-candidates`: legacy page-level mode. Keeps pages with table-like signals, dense numeric rows, financial statement headings, or KPI aliases.
+- `all`: legacy page-level mode that queues every page.
+- `sample`: legacy seeded random sample across all discovered pages.
+
+Indexer smoke check:
+
+```bash
+uv run python annotation_OCR/ocr_index.py \
+ --ocr-root DeepSeekOCR_Ardian_pruned_1k \
+ --raw-root /data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs \
+ --queue-mode tables \
+ --sample-size 20 \
+ --limit-reports 2 \
+ --check
+```
+
+## Keyboard
+
+- `a`: mark Yes, save, advance
+- `r`: mark No, save, advance
+- `u`: mark Uncertain, save, advance
+- `j` / right arrow: next page
+- `k` / left arrow: previous page
+- `+`, `-`, `0`: zoom / reset
+- `f`: refocus on the detected table
+- `?`: shortcut dialog
+
+Shortcuts are disabled while typing in notes or editing form controls.
+
+## Outputs
+
+Each session writes to `annotation_OCR/sessions/{session_id}/`:
+
+- `metadata.json`: session name, annotator, configuration, counts, timestamps.
+- `manifest.json`: queued items and mapping diagnostics.
+- `annotations.jsonl`: append-only event log, one saved annotation per line.
+- `current_annotations.json`: latest annotation per item, written atomically.
+- `summary.csv`: one row per queued item, including unreviewed items.
+- `summary.md`: status-count overview.
+
+Regenerate summaries:
+
+```bash
+uv run python annotation_OCR/summarize.py --session-id SESSION_ID
+uv run python annotation_OCR/summarize.py --all
+```
+
+## Annotation Schema
+
+Primary fields:
+
+- `overall_status`: `ok`, `not_ok`, `uncertain`, or `unreviewed`
+- `notes`: optional free text
+
+Identity fields include `industry_slug`, `report_name`, `exchange`, `ticker`, `year`, `page_index`, `page_number`, `mmd_path`, `raw_png_path`, and `page_text_sha256`.
+
+For table sessions, summary rows also include `item_kind`, `table_index`,
+`table_row_count`, `table_col_count`, `det_mmd_path`, and `focus_bbox`.
+
+## Downstream Joins
+
+For table-level filtering, join annotation summaries on:
+
+```text
+exchange, ticker, year, page_index, table_index
+```
+
+For report-level benchmark filtering, aggregate the table-level (or, in legacy page modes, page-level) labels to:
+
+```text
+exchange, ticker, year
+```
+
+A conservative report-level rule is to exclude a report when any reviewed table
+item is `not_ok`, or when the share of `uncertain` table items exceeds a
+threshold chosen for the benchmark run.
\ No newline at end of file
diff --git a/annotation_OCR/__init__.py b/annotation_OCR/__init__.py
new file mode 100644
index 0000000..e045a18
--- /dev/null
+++ b/annotation_OCR/__init__.py
@@ -0,0 +1 @@
+"""OCR annotation interface package."""
diff --git a/annotation_OCR/manifests/README.md b/annotation_OCR/manifests/README.md
new file mode 100644
index 0000000..cba2bc8
--- /dev/null
+++ b/annotation_OCR/manifests/README.md
@@ -0,0 +1,46 @@
+# Table Manifests
+
+Place reusable sampled table manifests here.
+
+Recommended default:
+
+```bash
+uv run python annotation_OCR/ocr_index.py \
+ --queue-mode tables \
+ --sample-size 5000 \
+ --seed 42 \
+ --output annotation_OCR/manifests/tables_5000.json
+```
+
+When `tables_5000.json` exists, `annotation_OCR/server.py` will use it by default for new sessions.
+
+## Study Session Bundles
+
+For paper annotation rounds, also build the headcount-specific session bundles:
+
+```bash
+uv run python annotation_OCR/study_sessions.py \
+ --source-manifest annotation_OCR/manifests/tables_5000.json \
+ --output-dir annotation_OCR/manifests \
+ --annotators 14 15 16 \
+ --seed 42
+```
+
+This creates:
+
+- `study_sessions_14.json`
+- `study_sessions_15.json`
+- `study_sessions_16.json`
+
+Use the bundle matching the final annotator count when starting the server:
+
+```bash
+uv run python annotation_OCR/server.py \
+ --study-bundle annotation_OCR/manifests/study_sessions_15.json
+```
+
+Why the 14-annotator bundle differs:
+
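+- `1500 unique + 300 triple-coded` means 1200 single-coded tables plus 300 tables coded three times, i.e. `1200 + 3 × 300 = 2100` total annotations.
+- That fits 15 or 16 annotators while keeping each session in the `120–140` range (`2100 / 15 = 140`, `2100 / 16 ≈ 131`); with 14 annotators it would need `150` per session.
+- For 14 annotators, the bundle uses `220` agreement tables instead, for `1280 + 3 × 220 = 1940` total annotations and per-session targets of `138–139`.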
diff --git a/annotation_OCR/ocr_index.py b/annotation_OCR/ocr_index.py
new file mode 100644
index 0000000..fb7ac21
--- /dev/null
+++ b/annotation_OCR/ocr_index.py
@@ -0,0 +1,928 @@
+"""Build OCR annotation queues.
+
+The annotation UI can work either at page level from canonical ``.mmd`` files
+or at table level from ``*_det.mmd`` files that carry OCR coordinates.
+Page positions are preserved exactly: page index ``i`` in an ``.mmd`` split
+maps to ``pages/page_XXXX.png`` with the same zero-based index when the raw
+image exists.
+"""
+
+from __future__ import annotations
+
+import argparse
+import html
+import hashlib
+import json
+import random
+import re
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+
+HERE = Path(__file__).resolve().parent
+REPO_ROOT = HERE.parent
+
+DEFAULT_OCR_ROOT = REPO_ROOT / "DeepSeekOCR_Ardian_pruned_1k"
+DEFAULT_RAW_ROOT = Path(
+ "/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs"
+)
+
+# Marker that separates consecutive pages inside an ``.mmd`` file.
+PAGE_SPLIT_RE = re.compile(r"<---\s*Page Split\s*--->", re.IGNORECASE)
+# ``EXCHANGE_TICKER_YEAR`` with an optional trailing ``_<hex hash>`` suffix.
+REPORT_NAME_RE = re.compile(r"^([A-Z0-9-]+)_(.+)_(\d{4})(?:_[0-9a-fA-F]{8,})?$")
+HASH_SUFFIX_RE = re.compile(r"_[0-9a-fA-F]{8,}$")
+# Header line of a detection block: ``<|ref|>type<|/ref|><|det|>coords<|/det|>``.
+DET_HEADER_RE = re.compile(
+    r"(?m)^<\|ref\|>([^<]+)<\|/ref\|><\|det\|>(.*?)<\|/det\|>\s*$"
+)
+# NOTE(review): the row/cell patterns were reconstructed after the tag text was
+# lost from this patch; confirm them against the committed file. ``\b`` keeps
+# ``<thead>`` / ``<tbody>`` from being mistaken for ``<th>`` / ``<tr>`` cells.
+HTML_ROW_RE = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.IGNORECASE | re.DOTALL)
+HTML_CELL_RE = re.compile(r"<t[dh]\b[^>]*>(.*?)</t[dh]>", re.IGNORECASE | re.DOTALL)
+HTML_TAG_RE = re.compile(r"<[^>]+>")
+
+CORE_KPI_ALIASES = {
+ "revenue": [
+ "net sales",
+ "total net sales",
+ "sales revenue",
+ "revenues",
+ "revenue",
+ "net revenue",
+ ],
+ "gross_profit": ["gross profit", "gross margin"],
+ "operating_income": [
+ "operating income",
+ "income from operations",
+ "operating profit",
+ ],
+ "net_income": [
+ "net income",
+ "net earnings",
+ "net loss",
+ "net income attributable",
+ ],
+ "total_assets": ["total assets"],
+ "total_liabilities": ["total liabilities", "liabilities"],
+ "cash_and_equivalents": [
+ "cash and cash equivalents",
+ "cash equivalents",
+ "cash, cash equivalents",
+ ],
+ "operating_cash_flow": [
+ "net cash provided by operating activities",
+ "cash flow from operating activities",
+ "operating cash flow",
+ ],
+ "capex": [
+ "capital expenditures",
+ "capital expenditure",
+ "additions to property, plant and equipment",
+ "purchase of property and equipment",
+ "additions of long-lived assets",
+ ],
+}
+
+FINANCIAL_TABLE_HEADINGS = [
+ "consolidated statement of operations",
+ "consolidated statements of operations",
+ "consolidated income statement",
+ "consolidated statements of income",
+ "consolidated balance sheet",
+ "consolidated balance sheets",
+ "consolidated cash flow statement",
+ "consolidated statements of cash flows",
+ "consolidated statement of cash flows",
+ "statements of comprehensive income",
+ "statement of financial position",
+ "notes to the consolidated financial statements",
+ "selected financial data",
+ "five year record",
+]
+
+NUMERIC_ROW_RE = re.compile(
+ r"(? dict[str, Any]:
+ record = asdict(self)
+ if not include_text:
+ record.pop("page_text", None)
+ return record
+
+
+def parse_report_name(name: str) -> tuple[str, str, int] | None:
+    """Split a report directory name into ``(exchange, ticker, year)``.
+
+    Returns ``None`` when ``name`` does not match ``REPORT_NAME_RE``.
+    """
+    found = REPORT_NAME_RE.match(name)
+    if found is None:
+        return None
+    exchange, ticker, year_text = found.groups()
+    return exchange, ticker, int(year_text)
+
+
+def strip_hash_suffix(name: str) -> str:
+    """Remove a trailing ``_<hex>`` suffix (as matched by ``HASH_SUFFIX_RE``)."""
+    cleaned, _replacements = HASH_SUFFIX_RE.subn("", name)
+    return cleaned
+
+
+def report_base_name(name: str) -> str:
+    """Return ``EXCHANGE_TICKER_YEAR`` for a parseable report name.
+
+    Names that do not parse fall back to the input with any hash suffix removed.
+    """
+    parts = parse_report_name(name)
+    if parts is not None:
+        return "_".join(str(part) for part in parts)
+    return strip_hash_suffix(name)
+
+
+def find_mmd(report_dir: Path) -> Path | None:
+    """Locate the canonical ``.mmd`` file inside ``report_dir``.
+
+    Preference order: ``<dir name>.mmd``, ``<base name>.mmd``, the first
+    (sorted) ``*.mmd`` that is not a ``*_det.mmd``, then the first ``*.mmd`` of
+    any kind. Returns ``None`` when the directory holds no ``.mmd`` file.
+    """
+    exact = report_dir / f"{report_dir.name}.mmd"
+    if exact.is_file():
+        return exact
+
+    stripped = report_dir / f"{report_base_name(report_dir.name)}.mmd"
+    if stripped.is_file():
+        return stripped
+
+    every_mmd = sorted(report_dir.glob("*.mmd"))
+    canonical = [path for path in every_mmd if not path.name.endswith("_det.mmd")]
+    if canonical:
+        return canonical[0]
+    return every_mmd[0] if every_mmd else None
+
+
+def find_det_mmd(report_dir: Path) -> Path | None:
+    """Locate the ``*_det.mmd`` file inside ``report_dir``.
+
+    Tries ``<dir name>_det.mmd``, then ``<base name>_det.mmd``, then the first
+    (sorted) ``*_det.mmd``. Returns ``None`` when none exists.
+    """
+    # Stems are produced lazily so the base name is only computed when needed.
+    stem_factories = (
+        lambda: report_dir.name,
+        lambda: report_base_name(report_dir.name),
+    )
+    for make_stem in stem_factories:
+        candidate = report_dir / f"{make_stem()}_det.mmd"
+        if candidate.is_file():
+            return candidate
+
+    remaining = sorted(report_dir.glob("*_det.mmd"))
+    if not remaining:
+        return None
+    return remaining[0]
+
+
+def discover_reports(root: Path) -> list[ReportInfo]:
+    """Find every report directory under ``root`` that holds an ``.mmd`` file.
+
+    Directories whose name does not parse as a report name are skipped. The
+    industry slug is taken from the name of the report directory's parent.
+    """
+    candidate_dirs = sorted({path.parent for path in root.rglob("*.mmd")})
+    found: list[ReportInfo] = []
+    for directory in candidate_dirs:
+        name_parts = parse_report_name(directory.name)
+        if name_parts is None:
+            continue
+        canonical = find_mmd(directory)
+        detection = find_det_mmd(directory)
+        if not (canonical is not None or detection is not None):
+            continue
+        exchange, ticker, year = name_parts
+        info = ReportInfo(
+            industry_slug=directory.parent.name,
+            name=directory.name,
+            exchange=exchange,
+            ticker=ticker,
+            year=year,
+            report_dir=directory,
+            # Fall back to the detection file when no canonical file exists.
+            mmd_path=canonical or detection,
+            det_mmd_path=detection,
+        )
+        found.append(info)
+    return found
+
+
+def split_pages(raw: str) -> list[str]:
+    """Split raw ``.mmd`` text on page-split markers and strip each page.
+
+    A single empty trailing page (text ending with a marker, or empty input)
+    is dropped; empty pages elsewhere are kept so page indices stay aligned.
+    """
+    chunks = [chunk.strip() for chunk in PAGE_SPLIT_RE.split(raw)]
+    if chunks and chunks[-1] == "":
+        del chunks[-1]
+    return chunks
+
+
+def load_pages(mmd_path: Path) -> list[str]:
+    """Read ``mmd_path`` as UTF-8 (undecodable bytes replaced) and split it into pages."""
+    with mmd_path.open("r", encoding="utf-8", errors="replace") as handle:
+        contents = handle.read()
+    return split_pages(contents)
+
+
+def parse_bboxes(raw: str) -> list[list[int]]:
+    """Extract groups of four integers from a raw coordinate string.
+
+    Every run of digits (with optional leading ``-``) is read as one integer;
+    the integers are grouped four at a time and an incomplete trailing group
+    is discarded.
+    """
+    numbers = [int(token) for token in re.findall(r"-?\d+", raw)]
+    usable = len(numbers) - len(numbers) % 4
+    return [numbers[start : start + 4] for start in range(0, usable, 4)]
+
+
+def parse_det_blocks(page_text: str) -> list[DetBlock]:
+    """Parse one page of ``*_det.mmd`` text into detection blocks.
+
+    Each header matched by ``DET_HEADER_RE`` starts a block; the block payload
+    is the stripped text up to the next header (or the end of the page).
+    Returns an empty list when the page has no headers.
+    """
+    headers = list(DET_HEADER_RE.finditer(page_text))
+    # The payload of header i ends where header i + 1 begins.
+    payload_ends = [header.start() for header in headers[1:]] + [len(page_text)]
+
+    parsed: list[DetBlock] = []
+    for header, payload_end in zip(headers, payload_ends):
+        raw_coords = header.group(2).strip()
+        parsed.append(
+            DetBlock(
+                ref_type=header.group(1).strip().lower(),
+                bbox_raw=raw_coords,
+                bboxes=parse_bboxes(raw_coords),
+                payload=page_text[header.end() : payload_end].strip(),
+            )
+        )
+    return parsed
+
+
+def strip_html(value: str) -> str:
+    """Replace tags with spaces, decode HTML entities and collapse whitespace."""
+    without_tags = HTML_TAG_RE.sub(" ", value)
+    decoded = html.unescape(without_tags)
+    words = decoded.split()
+    return " ".join(words)
+
+
+def table_dimensions(table_html: str) -> tuple[int, int]:
+    """Return ``(row_count, col_count)`` for an HTML table fragment.
+
+    ``col_count`` is the largest number of cells matched in any single row;
+    both values are 0 when no row is matched.
+    """
+    rows = HTML_ROW_RE.findall(table_html)
+    widest = max((len(HTML_CELL_RE.findall(row)) for row in rows), default=0)
+    return len(rows), widest
+
+
+def combined_bbox(bboxes: list[list[int]]) -> list[int] | None:
+    """Return the box spanning all ``bboxes``, or ``None`` for empty input.
+
+    The result takes the minimum of positions 0 and 1 and the maximum of
+    positions 2 and 3 across the boxes.
+    """
+    if not bboxes:
+        return None
+    first = bboxes[0]
+    low_a, low_b, high_a, high_b = first[0], first[1], first[2], first[3]
+    for box in bboxes[1:]:
+        low_a = min(low_a, box[0])
+        low_b = min(low_b, box[1])
+        high_a = max(high_a, box[2])
+        high_b = max(high_b, box[3])
+    return [low_a, low_b, high_a, high_b]
+
+
+def nearby_context(blocks: list[DetBlock], block_index: int, *, direction: int) -> str:
+    """Collect up to two nearby text-like blocks on one side of a block.
+
+    Walks from ``block_index`` in steps of ``direction`` and keeps blocks
+    whose ``ref_type`` is ``text``, ``title`` or ``sub_title`` and whose
+    payload is non-empty. The tag-stripped payloads are returned in page
+    order, joined with newlines.
+    """
+    wanted_types = {"text", "title", "sub_title"}
+    found: list[str] = []
+    cursor = block_index + direction
+    while len(found) < 2 and 0 <= cursor < len(blocks):
+        candidate = blocks[cursor]
+        cursor += direction
+        if candidate.payload and candidate.ref_type in wanted_types:
+            found.append(strip_html(candidate.payload))
+    # Walking backwards collects the nearest block first; restore page order.
+    ordered = found[::-1] if direction < 0 else found
+    return "\n".join(filter(None, ordered))
+
+
+def detect_table_reasons(
+    table_html: str, context_before: str, context_after: str
+) -> list[str]:
+    """Return candidate reasons for a detected table.
+
+    The list always starts with ``"det-table"``, followed by the reasons that
+    ``detect_candidate_reasons`` reports for the table plus its surrounding
+    context, without duplicates and in first-seen order.
+    """
+    combined = "\n".join(filter(None, [context_before, table_html, context_after]))
+    reasons = ["det-table"]
+    for reason in detect_candidate_reasons(combined):
+        if reason not in reasons:
+            reasons.append(reason)
+    return reasons
+
+
+def resolve_raw_dir(report: ReportInfo, raw_root: Path) -> tuple[Path | None, str]:
+    """Find the raw directory for ``report`` under ``raw_root``.
+
+    Returns ``(directory, status)``. The lookup tries, in order: the exact
+    report name, the hash-stripped base name, and a unique directory whose
+    name starts with the base name. The status string records which step
+    succeeded or why no directory was chosen.
+    """
+    industry_root = raw_root / report.industry_slug
+    if not industry_root.is_dir():
+        return None, "raw-industry-missing"
+
+    same_name = industry_root / report.name
+    if same_name.is_dir():
+        return same_name, "ok-exact"
+
+    base_name = report_base_name(report.name)
+    without_hash = industry_root / base_name
+    if without_hash.is_dir():
+        return without_hash, "ok-hash-stripped"
+
+    prefixed = sorted(
+        entry for entry in industry_root.glob(f"{base_name}*") if entry.is_dir()
+    )
+    if not prefixed:
+        return None, "raw-dir-missing"
+    if len(prefixed) > 1:
+        # Several directories share the prefix; refuse to guess.
+        return None, "raw-dir-ambiguous"
+    return prefixed[0], "ok-glob"
+
+
+def list_page_pngs(raw_dir: Path | None) -> list[Path]:
+    """Return the sorted ``page_*.png`` files in ``raw_dir / "pages"``.
+
+    Returns an empty list when ``raw_dir`` is ``None`` or has no ``pages``
+    directory.
+    """
+    pages_dir = None if raw_dir is None else raw_dir / "pages"
+    if pages_dir is None or not pages_dir.is_dir():
+        return []
+    images = [entry for entry in pages_dir.glob("page_*.png") if entry.is_file()]
+    images.sort()
+    return images
+
+
+def resolve_table_source(report: ReportInfo, raw_root: Path) -> TableSourceInfo | None:
+    """Choose which ``*_det.mmd`` and page images back a report's table items.
+
+    The raw directory is preferred when it has both a ``*_det.mmd`` file and
+    page images. Otherwise the report's own (pruned) ``det_mmd_path`` is used
+    with whatever raw page images exist, and a ``source_warning`` records the
+    fallback. Returns ``None`` when no detection file is available at all.
+
+    ``mapping_status`` always carries the status reported by
+    ``resolve_raw_dir``, so "raw-industry-missing" and "raw-dir-ambiguous"
+    stay distinguishable from "raw-dir-missing", as in ``iter_page_items``.
+    """
+    raw_dir, raw_status = resolve_raw_dir(report, raw_root)
+    # List the raw page images once; ``list_page_pngs(None)`` is ``[]``.
+    raw_page_pngs = list_page_pngs(raw_dir)
+
+    if raw_dir is not None and raw_page_pngs:
+        raw_det_mmd = find_det_mmd(raw_dir)
+        if raw_det_mmd is not None:
+            return TableSourceInfo(
+                report_dir=raw_dir,
+                mmd_path=find_mmd(raw_dir) or raw_det_mmd,
+                det_mmd_path=raw_det_mmd,
+                page_pngs=raw_page_pngs,
+                mapping_status=raw_status,
+            )
+
+    local_det_mmd = report.det_mmd_path
+    if local_det_mmd is None:
+        return None
+
+    if raw_dir is not None:
+        source_warning = "table-source-fallback-pruned-det"
+    else:
+        source_warning = "table-source-no-raw-match"
+
+    return TableSourceInfo(
+        report_dir=report.report_dir,
+        mmd_path=report.mmd_path,
+        det_mmd_path=local_det_mmd,
+        page_pngs=raw_page_pngs,
+        # Keep the resolver's own status instead of flattening every failure
+        # into "raw-dir-missing".
+        mapping_status=raw_status,
+        source_warning=source_warning,
+    )
+
+
+def page_png_for(page_pngs: list[Path], page_index: int) -> Path | None:
+    """Return the image for ``page_index``, or ``None`` when there is none.
+
+    A file named ``page_<index:04d>.png`` wins; otherwise the image at list
+    position ``page_index`` is used when that position exists.
+    """
+    expected_name = f"page_{page_index:04d}.png"
+    by_name = next((path for path in page_pngs if path.name == expected_name), None)
+    if by_name is not None:
+        return by_name
+    in_range = 0 <= page_index < len(page_pngs)
+    return page_pngs[page_index] if in_range else None
+
+
+def has_markdown_table(lines: list[str]) -> bool:
+    """Report whether ``lines`` look like they contain a Markdown table.
+
+    True when any line matches ``MARKDOWN_TABLE_SEPARATOR_RE`` or when at
+    least two lines contain two or more ``|`` characters.
+    """
+    for line in lines:
+        if MARKDOWN_TABLE_SEPARATOR_RE.match(line):
+            return True
+    pipe_rows = [line for line in lines if line.count("|") >= 2]
+    return len(pipe_rows) >= 2
+
+
+def dense_numeric_row_count(lines: list[str]) -> int:
+    """Count the lines with at least three ``NUMERIC_ROW_RE`` matches."""
+    dense = 0
+    for line in lines:
+        if len(NUMERIC_ROW_RE.findall(line)) >= 3:
+            dense += 1
+    return dense
+
+
+def detect_candidate_reasons(text: str) -> list[str]:
+    """List the table-like signals found in ``text``.
+
+    Possible reasons, in the order they are reported:
+
+    - ``markdown-table``: ``has_markdown_table`` accepts the non-empty lines.
+    - ``html-table``: the text contains an opening or closing ``table`` tag.
+    - ``dense-numeric-rows``: at least three lines are numerically dense.
+    - ``financial-heading``: any ``FINANCIAL_TABLE_HEADINGS`` phrase occurs.
+    - ``kpi-aliases``: at least two distinct ``CORE_KPI_ALIASES`` phrases occur.
+
+    Heading and alias matching is case-insensitive substring matching.
+    """
+    lowered = text.lower()
+    lines = [line.strip() for line in text.splitlines() if line.strip()]
+    reasons: list[str] = []
+
+    if has_markdown_table(lines):
+        reasons.append("markdown-table")
+    # Probe for the literal tags: an empty-string probe is contained in every
+    # text and would tag everything as an HTML table.
+    if "<table" in lowered or "</table>" in lowered:
+        reasons.append("html-table")
+
+    if dense_numeric_row_count(lines) >= 3:
+        reasons.append("dense-numeric-rows")
+
+    if any(heading in lowered for heading in FINANCIAL_TABLE_HEADINGS):
+        reasons.append("financial-heading")
+
+    # Deduplicate aliases shared between KPIs before counting hits.
+    aliases = {alias for vals in CORE_KPI_ALIASES.values() for alias in vals}
+    alias_hits = sum(1 for alias in aliases if alias in lowered)
+    if alias_hits >= 2:
+        reasons.append("kpi-aliases")
+
+    return reasons
+
+
+def text_preview(text: str, max_chars: int = 500) -> str:
+    """Return ``text`` with whitespace collapsed, cut to at most ``max_chars``.
+
+    Text that already fits is returned unchanged after collapsing. Longer text
+    is cut so that the result, including the trailing ``"..."``, never exceeds
+    ``max_chars`` characters.
+    """
+    compact = " ".join(text.split())
+    if len(compact) <= max_chars:
+        return compact
+    ellipsis = "..."
+    if max_chars <= len(ellipsis):
+        # No room for any text; return as much of the ellipsis as fits.
+        return ellipsis[: max(max_chars, 0)]
+    # Reserve room for the ellipsis so the length limit really holds.
+    return compact[: max_chars - len(ellipsis)].rstrip() + ellipsis
+
+
+def page_text_hash(text: str) -> str:
+    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8."""
+    digest = hashlib.sha256()
+    digest.update(text.encode("utf-8", errors="replace"))
+    return digest.hexdigest()
+
+
+def make_mapping_warnings(
+    *,
+    raw_dir: Path | None,
+    page_pngs: list[Path],
+    page_index: int,
+    mmd_page_count: int,
+    extra_warnings: list[str] | None = None,
+) -> list[str]:
+    """Build the mapping warnings for one page.
+
+    Warnings cover a missing raw directory (or missing ``pages``
+    subdirectory), an image count that differs from ``mmd_page_count``, and a
+    missing image for ``page_index``. ``extra_warnings`` are appended last.
+    """
+    if raw_dir is None:
+        directory_warning = "raw-directory-missing"
+    elif (raw_dir / "pages").is_dir():
+        directory_warning = None
+    else:
+        directory_warning = "raw-pages-directory-missing"
+
+    candidates = [
+        directory_warning,
+        "page-count-mismatch" if len(page_pngs) != mmd_page_count else None,
+        (
+            "raw-page-image-missing"
+            if page_png_for(page_pngs, page_index) is None
+            else None
+        ),
+    ]
+    warnings = [label for label in candidates if label is not None]
+    warnings.extend(extra_warnings or [])
+    return warnings
+
+
+def build_all_items(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    limit_reports: int | None = None,
+) -> list[PageItem]:
+    """Materialise every page-level item from ``iter_page_items`` into a list."""
+    items: list[PageItem] = []
+    items.extend(
+        iter_page_items(
+            ocr_root=ocr_root,
+            raw_root=raw_root,
+            limit_reports=limit_reports,
+        )
+    )
+    return items
+
+
+def iter_page_items(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    limit_reports: int | None = None,
+):
+    """Yield one ``PageItem`` per page of every discovered report.
+
+    Only the first ``limit_reports`` reports are read when the limit is set.
+    Page text itself is not stored on the item (``page_text`` is empty); only
+    its hash, length and a short preview are kept.
+    """
+    discovered = discover_reports(ocr_root)
+    selected = discovered if limit_reports is None else discovered[:limit_reports]
+
+    for report in selected:
+        page_texts = load_pages(report.mmd_path)
+        raw_dir, raw_status = resolve_raw_dir(report, raw_root)
+        page_pngs = list_page_pngs(raw_dir)
+        mmd_page_count = len(page_texts)
+
+        # Fields that are identical for every page of this report.
+        shared = dict(
+            industry_slug=report.industry_slug,
+            report_name=report.name,
+            exchange=report.exchange,
+            ticker=report.ticker,
+            year=report.year,
+            ocr_root=str(ocr_root),
+            raw_root=str(raw_root),
+            report_dir=str(report.report_dir),
+            raw_dir=str(raw_dir) if raw_dir else None,
+            mmd_path=str(report.mmd_path),
+            mmd_page_count=mmd_page_count,
+            png_page_count=len(page_pngs),
+            mapping_status=raw_status,
+            page_text="",
+        )
+
+        for page_index, page_text in enumerate(page_texts):
+            image = page_png_for(page_pngs, page_index)
+            yield PageItem(
+                item_id=f"{report.industry_slug}/{report.name}/page_{page_index:04d}",
+                page_index=page_index,
+                page_number=page_index + 1,
+                raw_png_path=str(image) if image else None,
+                mapping_warnings=make_mapping_warnings(
+                    raw_dir=raw_dir,
+                    page_pngs=page_pngs,
+                    page_index=page_index,
+                    mmd_page_count=mmd_page_count,
+                ),
+                candidate_reasons=detect_candidate_reasons(page_text),
+                page_text_sha256=page_text_hash(page_text),
+                page_text_chars=len(page_text),
+                page_text_preview=text_preview(page_text),
+                **shared,
+            )
+
+
+def iter_table_items(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    limit_reports: int | None = None,
+):
+    """Yield one table-level ``PageItem`` per non-empty ``table`` detection block.
+
+    Reports without a usable table source and pages without detection blocks
+    are skipped. ``table_index`` counts the yielded tables within a page,
+    starting at 0. The hash and character count describe the table payload,
+    not the whole page.
+    """
+    discovered = discover_reports(ocr_root)
+    selected = discovered if limit_reports is None else discovered[:limit_reports]
+
+    for report in selected:
+        source = resolve_table_source(report, raw_root)
+        if source is None:
+            continue
+
+        page_texts = load_pages(source.det_mmd_path)
+        # NOTE(review): ``source.report_dir`` is treated as the raw directory.
+        # When the table source fell back to the pruned OCR copy this is the
+        # OCR report directory, so ``raw_dir`` and the "pages" directory
+        # warning may not describe the real raw directory -- confirm intent.
+        raw_dir = source.report_dir
+        page_pngs = source.page_pngs
+        mmd_page_count = len(page_texts)
+        extra_warnings = [source.source_warning] if source.source_warning else []
+
+        # Fields that are identical for every table of this report.
+        shared = dict(
+            industry_slug=report.industry_slug,
+            report_name=report.name,
+            exchange=report.exchange,
+            ticker=report.ticker,
+            year=report.year,
+            ocr_root=str(ocr_root),
+            raw_root=str(raw_root),
+            report_dir=str(source.report_dir),
+            raw_dir=str(raw_dir) if raw_dir else None,
+            mmd_path=str(source.mmd_path),
+            mmd_page_count=mmd_page_count,
+            png_page_count=len(page_pngs),
+            mapping_status=source.mapping_status,
+            page_text="",
+            item_kind="table",
+            det_mmd_path=str(source.det_mmd_path),
+        )
+
+        for page_index, page_text in enumerate(page_texts):
+            blocks = parse_det_blocks(page_text)
+            if not blocks:
+                continue
+
+            # Computed once per page and shared by all tables on that page.
+            page_warnings = make_mapping_warnings(
+                raw_dir=raw_dir,
+                page_pngs=page_pngs,
+                page_index=page_index,
+                mmd_page_count=mmd_page_count,
+                extra_warnings=extra_warnings,
+            )
+            image = page_png_for(page_pngs, page_index)
+            table_blocks = [
+                (position, candidate)
+                for position, candidate in enumerate(blocks)
+                if candidate.ref_type == "table" and candidate.payload
+            ]
+
+            for table_index, (block_index, block) in enumerate(table_blocks):
+                context_before = nearby_context(blocks, block_index, direction=-1)
+                context_after = nearby_context(blocks, block_index, direction=1)
+                row_count, col_count = table_dimensions(block.payload)
+                focus_bboxes = [list(box) for box in block.bboxes]
+                preview_source = "\n".join(
+                    part
+                    for part in (
+                        context_before,
+                        strip_html(block.payload),
+                        context_after,
+                    )
+                    if part
+                )
+                yield PageItem(
+                    item_id=(
+                        f"{report.industry_slug}/{report.name}/page_{page_index:04d}"
+                        f"/table_{table_index:03d}"
+                    ),
+                    page_index=page_index,
+                    page_number=page_index + 1,
+                    raw_png_path=str(image) if image else None,
+                    mapping_warnings=page_warnings,
+                    candidate_reasons=detect_table_reasons(
+                        block.payload,
+                        context_before=context_before,
+                        context_after=context_after,
+                    ),
+                    page_text_sha256=page_text_hash(block.payload),
+                    page_text_chars=len(block.payload),
+                    page_text_preview=text_preview(preview_source),
+                    table_index=table_index,
+                    table_row_count=row_count,
+                    table_col_count=col_count,
+                    focus_bbox=combined_bbox(focus_bboxes),
+                    focus_bboxes=focus_bboxes,
+                    table_html=block.payload,
+                    context_before=context_before,
+                    context_after=context_after,
+                    **shared,
+                )
+
+
+def new_summary_state() -> dict[str, Any]:
+    """Return a fresh, empty accumulator for ``update_summary_state``."""
+    state: dict[str, Any] = {"report_names": set(), "page_keys": set()}
+    for total_key in ("items_total", "page_items_total", "table_items_total"):
+        state[total_key] = 0
+    for counter_key in (
+        "mapping_status_counts",
+        "mapping_warning_counts",
+        "candidate_reason_counts",
+    ):
+        # Each counter gets its own dict so they never alias one another.
+        state[counter_key] = {}
+    return state
+
+
+def update_summary_state(state: dict[str, Any], item: PageItem) -> None:
+    """Fold one item into the running summary ``state`` (mutated in place)."""
+
+    def bump(counter: dict, key) -> None:
+        counter[key] = counter.get(key, 0) + 1
+
+    state["report_names"].add(item.report_name)
+    state["page_keys"].add((item.report_name, item.page_index))
+    state["items_total"] += 1
+
+    kind_total = "table_items_total" if item.item_kind == "table" else "page_items_total"
+    state[kind_total] += 1
+
+    bump(state["mapping_status_counts"], item.mapping_status)
+    for warning in item.mapping_warnings:
+        bump(state["mapping_warning_counts"], warning)
+    for reason in item.candidate_reasons:
+        bump(state["candidate_reason_counts"], reason)
+
+
+def finish_summary_state(
+    state: dict[str, Any], queue: list[PageItem]
+) -> dict[str, Any]:
+    """Turn the running ``state`` plus the final ``queue`` into a summary dict.
+
+    Totals describe everything that was scanned; the ``queue_*`` entries
+    describe only the items that were actually queued.
+    """
+    queued_reports = {item.report_name for item in queue}
+    queued_pages = {(item.report_name, item.page_index) for item in queue}
+    queued_tables = [item for item in queue if item.item_kind == "table"]
+
+    summary: dict[str, Any] = {
+        "reports_total": len(state["report_names"]),
+        "pages_total": len(state["page_keys"]),
+    }
+    for total_key in ("items_total", "page_items_total", "table_items_total"):
+        summary[total_key] = state[total_key]
+    summary["queue_reports"] = len(queued_reports)
+    summary["queue_pages"] = len(queued_pages)
+    summary["queue_items"] = len(queue)
+    summary["queue_table_items"] = len(queued_tables)
+    for counter_key in (
+        "mapping_status_counts",
+        "mapping_warning_counts",
+        "candidate_reason_counts",
+    ):
+        summary[counter_key] = state[counter_key]
+    return summary
+
+
+def select_queue(
+    items: list[PageItem],
+    *,
+    queue_mode: str,
+    sample_size: int | None = None,
+    seed: int = 17,
+    limit: int | None = None,
+) -> list[PageItem]:
+    """Select the queue from an already materialised item list.
+
+    ``all`` and ``tables`` keep every item, ``table-candidates`` keeps items
+    with at least one candidate reason, and ``sample`` draws a seeded random
+    sample (100 items unless ``sample_size`` is given). ``tables`` also
+    samples when ``sample_size`` is set. Sampled queues are sorted; ``limit``
+    truncates the final result. Raises ``ValueError`` for an unknown mode.
+    """
+
+    def table_order(item):
+        return (
+            item.industry_slug,
+            item.report_name,
+            item.page_index,
+            item.table_index or -1,
+        )
+
+    def page_order(item):
+        return (item.industry_slug, item.report_name, item.page_index)
+
+    if queue_mode == "all":
+        chosen = list(items)
+    elif queue_mode == "table-candidates":
+        chosen = [item for item in items if item.candidate_reasons]
+    elif queue_mode == "tables":
+        chosen = list(items)
+        if sample_size is not None:
+            drawn = random.Random(seed).sample(chosen, min(sample_size, len(chosen)))
+            chosen = sorted(drawn, key=table_order)
+    elif queue_mode == "sample":
+        wanted = 100 if sample_size is None else sample_size
+        drawn = random.Random(seed).sample(items, min(wanted, len(items)))
+        chosen = sorted(drawn, key=page_order)
+    else:
+        raise ValueError(f"unknown queue mode: {queue_mode}")
+
+    return chosen if limit is None else chosen[:limit]
+
+
+def build_queue(
+    *,
+    ocr_root: Path,
+    raw_root: Path,
+    queue_mode: str = "tables",
+    sample_size: int | None = None,
+    seed: int = 17,
+    limit: int | None = None,
+    limit_reports: int | None = None,
+) -> tuple[list[PageItem], dict[str, Any]]:
+    """Scan the OCR corpus once and return ``(queue, summary)``.
+
+    Items are streamed rather than materialised. In ``sample`` mode, and in
+    ``tables`` mode with a ``sample_size``, a fixed-size sample is kept while
+    streaming and sorted afterwards; ``limit`` then truncates it. In the other
+    modes the scan stops as soon as ``limit`` items are queued, which the
+    summary reports as ``scan_stopped_by_limit``. Raises ``ValueError`` for an
+    unknown ``queue_mode``.
+    """
+    if queue_mode not in {"all", "table-candidates", "sample", "tables"}:
+        raise ValueError(f"unknown queue mode: {queue_mode}")
+
+    sampling = queue_mode == "sample" or (
+        queue_mode == "tables" and sample_size is not None
+    )
+    reservoir_size = 100 if sample_size is None else sample_size
+    keep_everything = queue_mode in {"all", "tables"}
+    source = iter_table_items if queue_mode == "tables" else iter_page_items
+
+    rng = random.Random(seed)
+    totals = new_summary_state()
+    queue: list[PageItem] = []
+    seen = 0
+    hit_limit = False
+
+    for item in source(
+        ocr_root=ocr_root,
+        raw_root=raw_root,
+        limit_reports=limit_reports,
+    ):
+        update_summary_state(totals, item)
+
+        if sampling:
+            seen += 1
+            if len(queue) < reservoir_size:
+                # Still filling the sample.
+                queue.append(item)
+                continue
+            # Sample full: the new item replaces a random slot, or is dropped
+            # when the drawn position falls outside the sample.
+            slot = rng.randint(0, seen - 1)
+            if slot < reservoir_size:
+                queue[slot] = item
+            continue
+
+        if not (keep_everything or bool(item.candidate_reasons)):
+            continue
+        queue.append(item)
+        if limit is not None and len(queue) >= limit:
+            hit_limit = True
+            break
+
+    if sampling:
+        queue.sort(
+            key=lambda item: (
+                item.industry_slug,
+                item.report_name,
+                item.page_index,
+                item.table_index or -1,
+            )
+        )
+        if limit is not None:
+            queue = queue[:limit]
+
+    summary = finish_summary_state(totals, queue)
+    summary.update(
+        queue_mode=queue_mode,
+        sample_size=sample_size,
+        seed=seed,
+        limit=limit,
+        limit_reports=limit_reports,
+        scan_stopped_by_limit=hit_limit,
+        ocr_root=str(ocr_root),
+        raw_root=str(raw_root),
+    )
+    return queue, summary
+
+
+def summarize_items(all_items: list[PageItem], queue: list[PageItem]) -> dict[str, Any]:
+    """Summarise a fully materialised item list together with its queue.
+
+    Totals and the status/warning/reason counters are computed over
+    ``all_items``; the ``queue_*`` entries are computed over ``queue``.
+    """
+
+    def bump(counter: dict[str, int], key: str) -> None:
+        counter[key] = counter.get(key, 0) + 1
+
+    status_counts: dict[str, int] = {}
+    warning_counts: dict[str, int] = {}
+    reason_counts: dict[str, int] = {}
+    table_total = 0
+    for item in all_items:
+        bump(status_counts, item.mapping_status)
+        for warning in item.mapping_warnings:
+            bump(warning_counts, warning)
+        for reason in item.candidate_reasons:
+            bump(reason_counts, reason)
+        if item.item_kind == "table":
+            table_total += 1
+
+    summary: dict[str, Any] = {}
+    summary["reports_total"] = len({item.report_name for item in all_items})
+    summary["pages_total"] = len(
+        {(item.report_name, item.page_index) for item in all_items}
+    )
+    summary["items_total"] = len(all_items)
+    summary["table_items_total"] = table_total
+    summary["queue_reports"] = len({item.report_name for item in queue})
+    summary["queue_pages"] = len(
+        {(item.report_name, item.page_index) for item in queue}
+    )
+    summary["queue_items"] = len(queue)
+    summary["mapping_status_counts"] = status_counts
+    summary["mapping_warning_counts"] = warning_counts
+    summary["candidate_reason_counts"] = reason_counts
+    return summary
+
+
+def write_json(path: Path, payload: Any) -> None:
+    """Write ``payload`` as indented JSON to ``path``, creating parent directories.
+
+    The text is written to a sibling ``.tmp`` file first and then moved over
+    ``path`` so readers never see a partially written file.
+    """
+    serialized = json.dumps(payload, indent=2)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    staging = path.with_suffix(path.suffix + ".tmp")
+    staging.write_text(serialized, encoding="utf-8")
+    staging.replace(path)
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the queue indexer."""
+    parser = argparse.ArgumentParser(description="Build an OCR page annotation queue.")
+    add = parser.add_argument
+
+    # Input locations.
+    add("--ocr-root", type=Path, default=DEFAULT_OCR_ROOT)
+    add("--raw-root", type=Path, default=DEFAULT_RAW_ROOT)
+
+    # Queue selection.
+    add(
+        "--queue-mode",
+        choices=["all", "table-candidates", "sample", "tables"],
+        default="tables",
+    )
+    add("--sample-size", type=int, default=None)
+    add("--seed", type=int, default=17)
+    add("--limit", type=int, default=None, help="Maximum queued pages.")
+    add(
+        "--limit-reports",
+        type=int,
+        default=None,
+        help="Read only the first N reports before queue selection.",
+    )
+
+    # Output handling.
+    add("--output", type=Path, default=None, help="Optional manifest JSON path.")
+    add("--check", action="store_true", help="Print summary and exit.")
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Command-line entry point: build the queue, then write and/or print it.
+
+    The manifest is written when ``--output`` is given. The summary is
+    printed when ``--check`` is given or when there is no output path.
+    Always returns 0.
+    """
+    options = build_arg_parser().parse_args(argv)
+    queue, summary = build_queue(
+        ocr_root=options.ocr_root,
+        raw_root=options.raw_root,
+        queue_mode=options.queue_mode,
+        sample_size=options.sample_size,
+        seed=options.seed,
+        limit=options.limit,
+        limit_reports=options.limit_reports,
+    )
+
+    manifest = dict(
+        summary=summary,
+        items=[item.to_manifest_record() for item in queue],
+    )
+    wants_file = bool(options.output)
+    if wants_file:
+        write_json(options.output, manifest)
+    if options.check or not wants_file:
+        print(json.dumps(summary, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/annotation_OCR/server.py b/annotation_OCR/server.py
new file mode 100644
index 0000000..727a04a
--- /dev/null
+++ b/annotation_OCR/server.py
@@ -0,0 +1,570 @@
+"""Browser-based OCR annotation server."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+import bleach
+import markdown as markdown_lib
+from flask import Flask, abort, jsonify, redirect, render_template, request, send_file
+
+from ocr_index import DEFAULT_OCR_ROOT, DEFAULT_RAW_ROOT, build_queue, load_pages
+from store import (
+ create_session,
+ list_sessions,
+ load_current_annotations,
+ load_manifest,
+ load_metadata,
+ save_annotation,
+ session_dir,
+ write_summary_files,
+)
+
+
# Directory containing this module; the default manifest lives beneath it.
HERE = Path(__file__).resolve().parent
DEFAULT_TABLE_MANIFEST = HERE.joinpath("manifests", "tables_5000.json")

# Matches Markdown image references that point into an ``images/`` folder.
# Group 1 is the ``![alt](`` opener, group 2 the relative path (optionally
# prefixed with ``./``), group 3 the closing parenthesis.
IMAGE_REF_RE = re.compile(r"(!\[[^\]]*\]\()((?:\./)?images/[^)\s]+)(\))")

# Tags permitted in sanitized HTML: bleach's defaults plus block-level,
# heading, table and image tags.
_EXTRA_TAGS = {
    "p",
    "br",
    "pre",
    "code",
    "hr",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "table",
    "thead",
    "tbody",
    "tfoot",
    "tr",
    "th",
    "td",
    "img",
    "blockquote",
    "del",
}
ALLOWED_TAGS = set(bleach.sanitizer.ALLOWED_TAGS) | _EXTRA_TAGS

# Attributes permitted per tag: bleach's defaults, with explicit entries for
# links, images and table cells taking precedence.
ALLOWED_ATTRIBUTES = dict(bleach.sanitizer.ALLOWED_ATTRIBUTES)
ALLOWED_ATTRIBUTES.update(
    {
        "a": ["href", "title", "rel", "target"],
        "img": ["src", "alt", "title"],
        "th": ["align", "colspan", "rowspan"],
        "td": ["align", "colspan", "rowspan"],
    }
)
+
+
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the annotation web server.

    Returns:
        A parser covering data roots, session selection/creation, queue
        sources (study bundle, precomputed manifest, or a fresh scan) and the
        host/port/debug settings passed to the web server.
    """
    cli = argparse.ArgumentParser(description="Run the OCR annotation web UI.")
    add = cli.add_argument

    # Data roots.
    add("--ocr-root", type=Path, default=DEFAULT_OCR_ROOT)
    add("--raw-root", type=Path, default=DEFAULT_RAW_ROOT)

    # Session identity.
    add("--session-id", default=None, help="Resume an existing session.")
    add("--session-name", default="OCR annotation session")
    add("--annotator", default="anonymous")

    # Queue sources.
    add(
        "--study-bundle",
        type=Path,
        default=None,
        help="Optional per-session study bundle. When set, each new session gets the next precomputed session queue.",
    )
    # The bundled manifest becomes the default only when the file exists at
    # the moment the parser is built.
    bundled_manifest = (
        DEFAULT_TABLE_MANIFEST if DEFAULT_TABLE_MANIFEST.is_file() else None
    )
    add(
        "--manifest-path",
        type=Path,
        default=bundled_manifest,
        help="Optional precomputed queue manifest to reuse instead of rescanning OCR files.",
    )

    # Queue construction.
    add(
        "--queue-mode",
        choices=["all", "table-candidates", "sample", "tables"],
        default="tables",
    )
    add("--sample-size", type=int, default=5000)
    add("--seed", type=int, default=17)
    add("--limit", type=int, default=None, help="Maximum queued items.")
    add(
        "--limit-reports",
        type=int,
        default=None,
        help="Read only the first N reports before queue selection.",
    )

    # Web server settings.
    add("--host", default="127.0.0.1")
    add("--port", type=int, default=5050)
    add("--debug", action="store_true")
    return cli
+
+
def prepare_session(args: argparse.Namespace) -> str:
    """Return the id of the session the server should open at startup.

    When ``args.session_id`` is set, that session's stored metadata is loaded
    and its id returned. Otherwise a queue is resolved from the CLI options
    and a new session is created from it.

    Args:
        args: Parsed options from the server's argument parser.

    Returns:
        The ``session_id`` of the resumed or newly created session.
    """
    requested_id = args.session_id
    if requested_id:
        return load_metadata(requested_id)["session_id"]

    bundle_path = args.study_bundle
    manifest_path = args.manifest_path
    manifest_items, index_summary, study_config = resolve_session_source(
        study_bundle_path=bundle_path,
        manifest_path=manifest_path,
        ocr_root=args.ocr_root,
        raw_root=args.raw_root,
        queue_mode=args.queue_mode,
        sample_size=args.sample_size,
        seed=args.seed,
        limit=args.limit,
        limit_reports=args.limit_reports,
    )

    # Record how the queue was produced; study-specific keys go last and win
    # on any name clash.
    session_config = {
        "ocr_root": str(args.ocr_root),
        "raw_root": str(args.raw_root),
        "study_bundle_path": str(bundle_path.resolve()) if bundle_path else None,
        "manifest_path": str(manifest_path) if manifest_path else None,
        "queue_mode": args.queue_mode,
        "sample_size": args.sample_size,
        "seed": args.seed,
        "limit": args.limit,
        "limit_reports": args.limit_reports,
    }
    session_config.update(study_config)

    created = create_session(
        session_name=args.session_name,
        annotator=args.annotator,
        manifest_items=manifest_items,
        index_summary=index_summary,
        config=session_config,
    )
    return created["session_id"]
+
+
@lru_cache(maxsize=64)
def cached_pages(mmd_path: str) -> tuple[str, ...]:
    """Return the pages loaded from ``mmd_path`` as a tuple.

    Results are memoized per path string; up to 64 entries are kept.
    """
    loaded_pages = load_pages(Path(mmd_path))
    return tuple(loaded_pages)
+
+
@lru_cache(maxsize=16)
def cached_manifest(session_id: str) -> tuple[dict[str, Any], ...]:
    """Return the manifest records of ``session_id`` as a tuple.

    Results are memoized per session id; up to 16 entries are kept.
    """
    records = load_manifest(session_id)
    return tuple(records)
+
+
def load_precomputed_manifest(
    manifest_path: Path,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Load queue items and their summary from a manifest JSON file.

    Args:
        manifest_path: JSON file holding an object with an ``items`` list and
            an optional ``summary`` object.

    Returns:
        ``(items, summary)`` where ``summary`` is a copy of the stored summary
        with ``manifest_path`` added (as a string).

    Raises:
        ValueError: If the document is not a JSON object, ``items`` is not a
            list, or ``summary`` is present but not an object. Malformed JSON
            raises ``json.JSONDecodeError``, itself a ``ValueError``.
        OSError: If the file cannot be read.
    """
    payload = json.loads(manifest_path.read_text(encoding="utf-8"))
    # A top-level list or scalar has no ``.get``; report it like every other
    # malformed-manifest case rather than leaking an AttributeError.
    if not isinstance(payload, dict):
        raise ValueError(f"invalid manifest payload in {manifest_path}")

    items = payload.get("items")
    if not isinstance(items, list):
        raise ValueError(f"invalid manifest items in {manifest_path}")

    # A missing or empty summary is treated as an empty object.
    summary = payload.get("summary") or {}
    if not isinstance(summary, dict):
        raise ValueError(f"invalid manifest summary in {manifest_path}")

    summary = {**summary, "manifest_path": str(manifest_path)}
    return items, summary
+
+
def load_study_bundle(bundle_path: Path) -> dict[str, Any]:
    """Load and minimally validate a study bundle JSON file.

    Args:
        bundle_path: JSON file expected to hold an object whose
            ``bundle_type`` is ``"ocr_table_study_bundle"`` and whose
            ``sessions`` entry is a list.

    Returns:
        The parsed bundle object, unchanged.

    Raises:
        ValueError: If the document is not a JSON object, has a different
            ``bundle_type``, or ``sessions`` is not a list. Malformed JSON
            raises ``json.JSONDecodeError``, itself a ``ValueError``.
        OSError: If the file cannot be read.
    """
    payload = json.loads(bundle_path.read_text(encoding="utf-8"))
    # Check the container type first so a top-level list or scalar produces
    # the same ValueError as any other invalid bundle.
    is_valid = (
        isinstance(payload, dict)
        and payload.get("bundle_type") == "ocr_table_study_bundle"
        and isinstance(payload.get("sessions"), list)
    )
    if not is_valid:
        raise ValueError(f"invalid study bundle in {bundle_path}")
    return payload
+
+
def claimed_study_slots(bundle_path: Path) -> set[int]:
    """Collect the study slots already taken by existing sessions.

    A session counts when its stored config names the same resolved bundle
    path. Its ``study_slot`` is accepted either as an int or as a string of
    digits; anything else is ignored.

    Args:
        bundle_path: Study bundle whose claimed slots are wanted.

    Returns:
        The set of claimed slot numbers.
    """
    bundle_key = str(bundle_path.resolve())
    taken: set[int] = set()
    for session_meta in list_sessions():
        session_config = session_meta.get("config") or {}
        if session_config.get("study_bundle_path") == bundle_key:
            raw_slot = session_config.get("study_slot")
            if isinstance(raw_slot, int):
                taken.add(raw_slot)
            elif isinstance(raw_slot, str) and raw_slot.isdigit():
                taken.add(int(raw_slot))
    return taken
+
+
def allocate_study_session(
    bundle_path: Path,
) -> tuple[list[dict[str, Any]], dict[str, Any], dict[str, Any]]:
    """Pick the first unclaimed session entry of a study bundle.

    Entries are scanned in bundle order; the first one whose ``slot`` is an
    int not already claimed by an existing session is selected.

    Args:
        bundle_path: Path to the study bundle JSON file.

    Returns:
        ``(items, summary, config)``: the entry's item list, the bundle
        summary extended with the bundle path and the study fields, and a
        config dict carrying the same study fields.

    Raises:
        ValueError: If every slot is already claimed, or the selected entry's
            ``items`` is not a list.
    """
    bundle = load_study_bundle(bundle_path)
    claimed = claimed_study_slots(bundle_path)

    def is_free(entry: dict[str, Any]) -> bool:
        candidate_slot = entry.get("slot")
        return isinstance(candidate_slot, int) and candidate_slot not in claimed

    chosen = next((entry for entry in bundle["sessions"] if is_free(entry)), None)
    if chosen is None:
        raise ValueError(f"all study sessions already assigned for {bundle_path}")

    items = chosen.get("items")
    if not isinstance(items, list):
        raise ValueError(f"invalid study session items in {bundle_path}")

    # A non-dict (or missing/empty) bundle summary is replaced by an empty one.
    base_summary = bundle.get("summary") or {}
    if not isinstance(base_summary, dict):
        base_summary = {}

    study_fields = {
        "study_slot": int(chosen["slot"]),
        "study_target_items": chosen.get("target_items"),
        "study_agreement_items": chosen.get("agreement_items"),
        "study_single_items": chosen.get("single_items"),
    }
    summary = {
        **base_summary,
        "study_bundle_path": str(bundle_path.resolve()),
        **study_fields,
    }
    return items, summary, dict(study_fields)
+
+
def resolve_session_source(
    *,
    study_bundle_path: Path | None,
    manifest_path: Path | None,
    ocr_root: Path,
    raw_root: Path,
    queue_mode: str,
    sample_size: int | None,
    seed: int,
    limit: int | None,
    limit_reports: int | None,
) -> tuple[list[dict[str, Any]], dict[str, Any], dict[str, Any]]:
    """Produce the item queue for a new session from the first available source.

    Sources are tried in this order: a study bundle, a precomputed manifest,
    and finally a fresh queue built from the OCR corpus. For the first two,
    ``limit`` truncates the item list and is recorded in the summary; the
    remaining queue-building options are only used by the last source.

    Returns:
        ``(items, summary, extra_config)``. ``extra_config`` is non-empty only
        for study bundles, where it carries the study fields (plus ``limit``
        when one was applied).
    """
    has_limit = limit is not None

    # 1. Study bundle: next unclaimed precomputed session queue.
    if study_bundle_path is not None:
        items, summary, config = allocate_study_session(study_bundle_path)
        if has_limit:
            items = items[:limit]
            summary = {**summary, "limit": limit}
            config = {**config, "limit": limit}
        return items, summary, config

    # 2. Precomputed manifest: reuse a previously built queue.
    if manifest_path is not None:
        items, summary = load_precomputed_manifest(manifest_path)
        if has_limit:
            items = items[:limit]
            summary = {**summary, "limit": limit}
        return items, summary, {}

    # 3. Fallback: build the queue from the OCR corpus.
    queue, index_summary = build_queue(
        ocr_root=ocr_root,
        raw_root=raw_root,
        queue_mode=queue_mode,
        sample_size=sample_size,
        seed=seed,
        limit=limit,
        limit_reports=limit_reports,
    )
    records = [entry.to_manifest_record() for entry in queue]
    return records, index_summary, {}
+
+
def get_item_or_404(session_id: str, index: int) -> dict[str, Any]:
    """Return manifest record ``index`` of a session, aborting with 404 if out of range."""
    records = cached_manifest(session_id)
    if not 0 <= index < len(records):
        abort(404, description="item index out of range")
    return records[index]
+
+
def item_page_text(item: dict[str, Any]) -> str:
    """Return the extracted text shown for a queue item.

    Table items yield their stored ``table_html`` (empty string when missing
    or falsy). Any other item yields page ``page_index`` of its ``mmd_path``
    file, or an empty string when that index is outside the loaded pages.
    """
    if item.get("item_kind") == "table":
        table_markup = item.get("table_html") or ""
        return str(table_markup)

    document_pages = cached_pages(item["mmd_path"])
    wanted = int(item.get("page_index", 0))
    return document_pages[wanted] if 0 <= wanted < len(document_pages) else ""
+
+
def omit_markdown_image_refs(markdown_text: str) -> str:
    """Replace each ``images/`` Markdown image reference with an italic placeholder.

    The placeholder keeps the referenced path so the reader can still see
    which image was omitted.
    """

    def placeholder(found: re.Match[str]) -> str:
        image_path = found.group(2)
        return f"_[image omitted: {image_path}]_"

    return IMAGE_REF_RE.sub(placeholder, markdown_text)
+
+
def rewrite_markdown_image_refs(markdown_text: str, session_id: str, index: int) -> str:
    """Point ``images/`` Markdown image references at the inline-image endpoint.

    Args:
        markdown_text: Markdown whose image references should be rewritten.
        session_id: Session the item belongs to; used verbatim in the URL.
        index: Position of the item in the session manifest.

    Returns:
        The Markdown with every matched image path replaced by
        ``/api/session/<session_id>/item/<index>/inline-image/<path>``.
    """

    def replace_md(match: re.Match[str]) -> str:
        # Drop only a literal leading "./". ``str.lstrip("./")`` would strip
        # any run of "." and "/" characters, which is correct only by accident
        # of the current regex.
        rel_path = match.group(2).removeprefix("./")
        src = f"/api/session/{session_id}/item/{index}/inline-image/{rel_path}"
        return f"{match.group(1)}{src}{match.group(3)}"

    return IMAGE_REF_RE.sub(replace_md, markdown_text)
+
+
def render_markdown_page(
    markdown_text: str,
    *,
    session_id: str,
    index: int,
    show_inline_images: bool,
) -> str:
    """Render extracted Markdown to sanitized HTML.

    Image references are either rewritten to the inline-image endpoint or
    replaced by placeholders, depending on ``show_inline_images``. The
    converted HTML is then cleaned against the module's tag and attribute
    allow-lists.
    """
    prepared = (
        rewrite_markdown_image_refs(markdown_text, session_id, index)
        if show_inline_images
        else omit_markdown_image_refs(markdown_text)
    )
    unsafe_html = markdown_lib.markdown(
        prepared,
        extensions=["tables", "fenced_code", "sane_lists", "nl2br"],
        output_format="html5",
    )
    sanitized = bleach.clean(
        unsafe_html,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=["http", "https", "mailto", "data"],
    )
    return sanitized
+
+
def safe_child_path(root: Path, relative_path: str) -> Path:
    """Resolve ``relative_path`` beneath ``root``, aborting with 400 if it escapes.

    Absolute paths and paths containing a ``..`` component are rejected up
    front. The joined path is then resolved and must still lie under the
    resolved root.
    """
    requested = Path(relative_path)
    looks_unsafe = requested.is_absolute() or ".." in requested.parts
    if looks_unsafe:
        abort(400, description="unsafe path")

    base = root.resolve()
    resolved = (base / requested).resolve()
    # Second check: resolution may still land outside ``base``.
    if not resolved.is_relative_to(base):
        abort(400, description="unsafe path")
    return resolved
+
+
def progress_payload(session_id: str) -> dict[str, Any]:
    """Summarize review progress for a session.

    Returns:
        A dict with the session metadata, the manifest length, the number of
        stored annotations, a count of items per ``overall_status`` (items
        without an annotation count as ``"unreviewed"``), and the index of the
        first manifest item that has no stored annotation (``None`` if every
        item has one).
    """
    metadata = load_metadata(session_id)
    manifest = cached_manifest(session_id)
    current = load_current_annotations(session_id)

    status_counts: dict[str, int] = {}
    first_open: int | None = None
    for position, entry in enumerate(manifest):
        entry_id = entry["item_id"]
        label = current.get(entry_id, {}).get("overall_status", "unreviewed")
        status_counts[label] = status_counts.get(label, 0) + 1
        if first_open is None and entry_id not in current:
            first_open = position

    return {
        "metadata": metadata,
        "item_count": len(manifest),
        "reviewed_count": len(current),
        "status_counts": status_counts,
        "next_unreviewed_index": first_open,
    }
+
+
def create_app(default_session_id: str | None, build_defaults: dict[str, Any]) -> Flask:
    """Build the Flask application serving the annotation UI and its JSON API.

    Args:
        default_session_id: Session that ``/`` redirects to when the request
            names none; ``None`` makes ``/`` render the landing page.
        build_defaults: Fallback queue-building options used by
            ``POST /api/sessions`` for any field the request body omits.

    Returns:
        The configured ``Flask`` app with all routes registered.
    """
    app = Flask(__name__, template_folder="templates", static_folder="static")
    app.config["DEFAULT_SESSION_ID"] = default_session_id
    app.config["BUILD_DEFAULTS"] = build_defaults

    @app.get("/")
    def index() -> Any:
        """Serve the annotation UI, a redirect to the default session, or the landing page."""
        # If ?session= in URL, serve the annotation UI for that session
        session_from_url = request.args.get("session")
        if session_from_url:
            return render_template("index.html", session_id=session_from_url)
        # If server was started with a pre-created session, redirect to it
        if default_session_id:
            return redirect(f"/?session={default_session_id}")
        # Otherwise show the landing / session picker page
        return render_template("landing.html")

    @app.get("/api/sessions")
    def api_sessions() -> Any:
        """List known sessions together with the server's default session id."""
        return jsonify(
            {
                "sessions": list_sessions(),
                "default_session_id": default_session_id or None,
            }
        )

    @app.post("/api/sessions")
    def api_create_session() -> Any:
        """Create a session from the JSON body, falling back to server defaults."""
        payload = request.get_json(force=True, silent=True) or {}
        # A JSON list/scalar body has no ``.get``; reject it explicitly.
        if not isinstance(payload, dict):
            abort(400, description="request body must be a JSON object")
        defaults = app.config["BUILD_DEFAULTS"]

        # NOTE(review): the path fields below come from the request body and
        # are read from the server's filesystem. Confirm this is acceptable
        # for multi-user deployments (README suggests --host 0.0.0.0).
        queue_mode = payload.get("queue_mode") or defaults["queue_mode"]
        ocr_root = payload.get("ocr_root") or defaults["ocr_root"]
        raw_root = payload.get("raw_root") or defaults["raw_root"]
        study_bundle_value = payload.get("study_bundle_path") or defaults.get(
            "study_bundle_path"
        )
        study_bundle_path = Path(study_bundle_value) if study_bundle_value else None
        manifest_path_value = payload.get("manifest_path") or defaults.get(
            "manifest_path"
        )
        manifest_path = Path(manifest_path_value) if manifest_path_value else None

        # Resolve each option once so the queue and the stored config agree.
        sample_size = payload.get("sample_size", defaults.get("sample_size"))
        seed = int(payload.get("seed", defaults["seed"]))
        limit = payload.get("limit", defaults.get("limit"))
        limit_reports = payload.get("limit_reports", defaults.get("limit_reports"))

        manifest_items, index_summary, study_config = resolve_session_source(
            study_bundle_path=study_bundle_path,
            manifest_path=manifest_path,
            ocr_root=Path(ocr_root),
            raw_root=Path(raw_root),
            queue_mode=queue_mode,
            sample_size=sample_size,
            seed=seed,
            limit=limit,
            limit_reports=limit_reports,
        )
        config = {
            "ocr_root": ocr_root,
            "raw_root": raw_root,
            "study_bundle_path": str(study_bundle_path.resolve())
            if study_bundle_path
            else None,
            "manifest_path": str(manifest_path) if manifest_path else None,
            "queue_mode": queue_mode,
            "sample_size": sample_size,
            "seed": seed,
            "limit": limit,
            "limit_reports": limit_reports,
            **study_config,
        }
        metadata = create_session(
            session_name=str(payload.get("session_name") or "OCR annotation session"),
            annotator=str(payload.get("annotator") or "anonymous"),
            manifest_items=manifest_items,
            index_summary=index_summary,
            config=config,
        )
        # Drop memoized manifests so the new session is read fresh.
        cached_manifest.cache_clear()
        return jsonify(
            {"metadata": metadata, "progress": progress_payload(metadata["session_id"])}
        )

    # Per-session routes. The URL variables must be declared in the rule so
    # Flask can pass them to the view functions; ``int`` and ``path``
    # converters match the integer item index and slash-containing image path.

    @app.get("/api/session/<session_id>")
    def api_session(session_id: str) -> Any:
        """Return the progress payload for a session."""
        return jsonify(progress_payload(session_id))

    @app.get("/api/session/<session_id>/item/<int:index>")
    def api_item(session_id: str, index: int) -> Any:
        """Return one queue item with its text, rendered HTML, annotation and image URLs."""
        manifest = cached_manifest(session_id)
        item = get_item_or_404(session_id, index)
        text = item_page_text(item)
        annotations = load_current_annotations(session_id)
        # Inline images are on unless the client passes inline_images=0.
        show_inline_images = request.args.get("inline_images", "1") != "0"
        next_image_url = None
        if index + 1 < len(manifest) and manifest[index + 1].get("raw_png_path"):
            next_image_url = f"/api/session/{session_id}/item/{index + 1}/raw-image"
        return jsonify(
            {
                "index": index,
                "item_count": len(manifest),
                "item": item,
                "annotation": annotations.get(item["item_id"]),
                "page_text": text,
                "markdown_html": render_markdown_page(
                    text,
                    session_id=session_id,
                    index=index,
                    show_inline_images=show_inline_images,
                ),
                "inline_images": show_inline_images,
                "image_url": f"/api/session/{session_id}/item/{index}/raw-image",
                "next_image_url": next_image_url,
            }
        )

    @app.get("/api/session/<session_id>/item/<int:index>/raw-image")
    def api_raw_image(session_id: str, index: int) -> Any:
        """Send the raw page image recorded for an item."""
        item = get_item_or_404(session_id, index)
        raw_png_path = item.get("raw_png_path")
        if not raw_png_path:
            abort(404, description="raw page image missing")
        target = Path(raw_png_path).resolve()
        # NOTE(review): when the item has no ``raw_root`` the containment
        # root falls back to "/", so the check below accepts any path.
        raw_root = Path(item.get("raw_root") or "/").resolve()
        if not target.is_relative_to(raw_root):
            abort(400, description="raw image outside raw root")
        if not target.is_file():
            abort(404, description="raw page image missing")
        return send_file(target, conditional=True, max_age=86400)

    @app.get("/api/session/<session_id>/item/<int:index>/inline-image/<path:rel_path>")
    def api_inline_image(session_id: str, index: int, rel_path: str) -> Any:
        """Send an image referenced by an item's Markdown, relative to its report directory."""
        item = get_item_or_404(session_id, index)
        report_dir = Path(item["report_dir"])
        target = safe_child_path(report_dir, rel_path)
        if not target.is_file():
            abort(404, description="inline OCR image missing")
        return send_file(target, conditional=True, max_age=86400)

    @app.post("/api/session/<session_id>/annotation")
    def api_save_annotation(session_id: str) -> Any:
        """Store the posted annotation and return it with updated progress."""
        payload = request.get_json(force=True, silent=False) or {}
        if not isinstance(payload, dict):
            abort(400, description="request body must be a JSON object")
        item_id = payload.get("item_id")
        if not item_id:
            abort(400, description="missing item_id")
        record = save_annotation(
            session_id=session_id, item_id=str(item_id), payload=payload
        )
        return jsonify({"annotation": record, "progress": progress_payload(session_id)})

    @app.get("/api/session/<session_id>/progress")
    def api_progress(session_id: str) -> Any:
        """Return the progress payload for a session."""
        return jsonify(progress_payload(session_id))

    @app.post("/api/session/<session_id>/summarize")
    def api_summarize(session_id: str) -> Any:
        """Write the summary files and return their paths with current progress."""
        paths = write_summary_files(session_id)
        return jsonify({"paths": paths, "progress": progress_payload(session_id)})

    @app.get("/api/session/<session_id>/summary.csv")
    def api_summary_csv(session_id: str) -> Any:
        """Regenerate the summaries and download ``summary.csv``."""
        write_summary_files(session_id)
        return send_file(session_dir(session_id) / "summary.csv", as_attachment=True)

    @app.get("/api/session/<session_id>/summary.md")
    def api_summary_md(session_id: str) -> Any:
        """Regenerate the summaries and download ``summary.md``."""
        write_summary_files(session_id)
        return send_file(session_dir(session_id) / "summary.md", as_attachment=True)

    return app
+
+
def main(argv: list[str] | None = None) -> int:
    """Parse CLI options, optionally prepare a session, and run the web server.

    A session is prepared up front when ``--session-id`` is given, or when
    either ``--annotator`` or ``--session-name`` differs from its default.
    Otherwise the server starts headless and sessions are created from the
    browser landing page.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` once the server stops.
    """
    args = build_arg_parser().parse_args(argv)

    customised_identity = (
        args.annotator != "anonymous" or args.session_name != "OCR annotation session"
    )
    session_id: str | None = None
    if args.session_id or customised_identity:
        session_id = prepare_session(args)

    bundle_path = args.study_bundle
    manifest_path = args.manifest_path
    # Defaults applied to browser-created sessions for omitted fields.
    build_defaults = {
        "ocr_root": str(args.ocr_root),
        "raw_root": str(args.raw_root),
        "study_bundle_path": str(bundle_path.resolve()) if bundle_path else None,
        "manifest_path": str(manifest_path) if manifest_path else None,
        "queue_mode": args.queue_mode,
        "sample_size": args.sample_size,
        "seed": args.seed,
        "limit": args.limit,
        "limit_reports": args.limit_reports,
    }
    app = create_app(session_id, build_defaults)

    if not session_id:
        print(
            "Starting in headless mode — users will create sessions from the browser."
        )
    else:
        print(f"Annotation session: {session_id}")
    print(f"Open: http://{args.host}:{args.port}")

    app.run(host=args.host, port=args.port, debug=args.debug)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/annotation_OCR/static/app.js b/annotation_OCR/static/app.js
new file mode 100644
index 0000000..d9a18c0
--- /dev/null
+++ b/annotation_OCR/static/app.js
@@ -0,0 +1,528 @@
// Mutable UI state shared by the handlers in this file.
const state = {
  // Session id resolution order: `window.OCR_ANNOTATION_SESSION_ID`, then the
  // `session` query parameter, then `window.OCR_ANNOTATION_DEFAULT_SESSION_ID`.
  sessionId: window.OCR_ANNOTATION_SESSION_ID
    || new URLSearchParams(window.location.search).get('session')
    || window.OCR_ANNOTATION_DEFAULT_SESSION_ID,
  index: 0, // zero-based position of the loaded item in the queue
  itemCount: 0, // queue length as last reported by the server
  item: null, // manifest record of the loaded item
  overallStatus: 'unreviewed', // status currently selected in the UI
  startedAt: null, // Date at which the current item was loaded
  zoom: 1, // multiplier applied on top of the fitted image size
  showingRaw: false, // true while the raw text pane replaces the rendered one
  showInlineImages: true, // sent to the server as the `inline_images` flag
  saving: false, // true while a save request is in flight
  prefetchImage: null, // Image object used to warm the next raw page image
};
+
// Pixels of padding kept on every side of the image inside the canvas.
const IMAGE_STAGE_PADDING = 16;
// Divisor that turns bounding-box coordinates into 0..1 fractions of the image.
// NOTE(review): presumably the detector emits coordinates on a 0..999 grid — confirm.
const DET_COORD_MAX = 999;
// Pixels subtracted from the stage size when fitting a focused table.
const FOCUS_VIEWPORT_MARGIN = 12;
// Factors by which the focused box is enlarged before computing the fit zoom.
const FOCUS_BOX_OVERSCAN_X = 1.06;
const FOCUS_BOX_OVERSCAN_Y = 1.08;
+
// Cached DOM lookups. Every key is also the id of the element it refers to,
// so the map is built from a single list of ids (order preserved).
const els = Object.fromEntries(
  [
    'sessionTitle',
    'sessionMeta',
    'progressText',
    'progressBar',
    'prevButton',
    'nextButton',
    'skipReviewedButton',
    'helpButton',
    'imageStage',
    'imageCanvas',
    'rawImage',
    'imageOverlay',
    'imageMissing',
    'imageSubtitle',
    'markdownSubtitle',
    'markdownPreview',
    'rawMarkdown',
    'inlineImagesToggle',
    'toggleRawButton',
    'zoomOutButton',
    'zoomResetButton',
    'zoomInButton',
    'refocusButton',
    'reportName',
    'industryValue',
    'tickerValue',
    'pageValue',
    'signalsValue',
    'mappingValue',
    'notesInput',
    'saveButton',
    'saveStatus',
    'summaryCsvLink',
    'summaryMdLink',
    'helpDialog',
  ].map((id) => [id, document.getElementById(id)]),
);
+
/**
 * Fetch `url` and resolve with the parsed JSON body.
 *
 * A JSON content-type header is sent by default; anything in `options`
 * (including `headers`) overrides the defaults. Non-2xx responses reject with
 * an Error whose message is the response body, or "<status> <statusText>"
 * when the body is empty.
 */
async function apiJson(url, options = {}) {
  const requestInit = {
    headers: { 'Content-Type': 'application/json' },
    ...options,
  };
  const response = await fetch(url, requestInit);
  if (response.ok) {
    return response.json();
  }
  const bodyText = await response.text();
  throw new Error(bodyText || `${response.status} ${response.statusText}`);
}
+
/** Show `message` in the save-status element and tag it with `tone`. */
function statusMessage(message, tone = 'neutral') {
  const { saveStatus } = els;
  saveStatus.textContent = message;
  saveStatus.dataset.tone = tone;
}
+
/** Join `values` with ", "; a missing or empty list yields "none". */
function formatList(values) {
  const isEmpty = !values || values.length === 0;
  return isEmpty ? 'none' : values.join(', ');
}
+
/**
 * Refresh the header from a server progress payload: session title and
 * annotator line, reviewed counter, progress bar width and summary links.
 * Also stores the reported item count in `state.itemCount`.
 */
function updateProgress(progress) {
  const sessionInfo = progress.metadata || {};
  const totalItems = progress.item_count || 0;
  const reviewedItems = progress.reviewed_count || 0;
  state.itemCount = totalItems;

  const annotatorName = sessionInfo.annotator || 'anonymous';
  const shownSessionId = sessionInfo.session_id || state.sessionId;
  els.sessionTitle.textContent = sessionInfo.session_name || sessionInfo.session_id || 'Session';
  els.sessionMeta.textContent = `${annotatorName} · ${shownSessionId}`;

  // Guard against division by zero for an empty queue.
  const percent = totalItems ? Math.round((reviewedItems / totalItems) * 100) : 0;
  els.progressText.textContent = `${reviewedItems} / ${totalItems} reviewed`;
  els.progressBar.style.width = `${percent}%`;

  const sessionBase = `/api/session/${state.sessionId}`;
  els.summaryCsvLink.href = `${sessionBase}/summary.csv`;
  els.summaryMdLink.href = `${sessionBase}/summary.md`;
}
+
/** Record `status` as the selected overall status and highlight its button. */
function setOverall(status) {
  state.overallStatus = status;
  const statusButtons = document.querySelectorAll('.status-button');
  for (const button of statusButtons) {
    const isSelected = button.dataset.status === status;
    button.classList.toggle('active', isSelected);
  }
}
+
/** Populate the status buttons and notes box from a stored annotation, if any. */
function loadAnnotation(annotation) {
  const storedStatus = annotation?.overall_status;
  const storedNotes = annotation?.notes;
  setOverall(storedStatus || 'unreviewed');
  els.notesInput.value = storedNotes || '';
}
+
/**
 * Width in pixels at which the raw image fits inside its stage at zoom 1.
 *
 * Before the image has loaded, its natural size is approximated from the
 * available width and a 1:1.414 aspect ratio. The result is never below 120.
 */
function fittedImageWidth() {
  const container = els.imageCanvas.parentElement;
  // 32 px are reserved in each dimension; looks like twice
  // IMAGE_STAGE_PADDING — TODO confirm.
  const roomX = Math.max(240, container.clientWidth - 32);
  const roomY = Math.max(240, container.clientHeight - 32);

  const imageWidth = els.rawImage.naturalWidth || roomX;
  const imageHeight = els.rawImage.naturalHeight || imageWidth * 1.414;

  const scaleX = roomX / imageWidth;
  const scaleY = roomY / imageHeight;
  const scale = Math.min(scaleX, scaleY);
  return Math.max(120, Math.floor(imageWidth * scale));
}
+
/**
 * Push the current image placement to the canvas as CSS custom properties,
 * redraw the focus overlay and update the zoom percentage label.
 */
function applyZoom() {
  const placement = imagePlacement();
  const cssPixelValues = [
    ['--canvas-width', placement.canvasWidth],
    ['--canvas-height', placement.canvasHeight],
    ['--image-left', placement.left],
    ['--image-top', placement.top],
    ['--image-width', placement.width],
    ['--image-height', placement.height],
  ];
  for (const [property, pixels] of cssPixelValues) {
    els.imageCanvas.style.setProperty(property, `${Math.round(pixels)}px`);
  }
  renderFocusOverlay();
  els.zoomResetButton.textContent = `${Math.round(state.zoom * 100)}%`;
}
+
/** Store `value` as the zoom level, limited to the range 0.35–3, and re-layout. */
function setZoom(value) {
  const atLeastMinimum = Math.max(0.35, value);
  state.zoom = Math.min(3, atLeastMinimum);
  applyZoom();
}
+
/** Run `callback` (with no arguments) on the next animation frame. */
function scheduleAfterLayout(callback) {
  // Wrapped so the frame timestamp is not forwarded to the callback.
  const runWithoutArguments = () => {
    callback();
  };
  window.requestAnimationFrame(runWithoutArguments);
}
+
/**
 * True when `item` (default: the loaded item) is a table item carrying a
 * `focus_bbox` of exactly four finite numbers.
 */
function hasTableFocus(item = state.item) {
  if (item?.item_kind !== 'table') return false;
  const bbox = item.focus_bbox;
  if (!Array.isArray(bbox) || bbox.length !== 4) return false;
  return bbox.every((coordinate) => Number.isFinite(coordinate));
}
+
/**
 * Size of the image at zoom 1: the fitted width plus a height that keeps the
 * image's aspect ratio (1:1.414 is assumed until the image has loaded).
 */
function baseFittedImageSize() {
  const fittedWidth = fittedImageWidth();
  const imageWidth = els.rawImage.naturalWidth || 1;
  const imageHeight = els.rawImage.naturalHeight || Math.max(1, imageWidth * 1.414);
  const aspect = imageHeight / imageWidth;
  return { width: fittedWidth, height: fittedWidth * aspect };
}
+
/** Image size at the current zoom level. */
function scaledImageSize() {
  const { width: fittedWidth, height: fittedHeight } = baseFittedImageSize();
  const factor = state.zoom;
  return { width: fittedWidth * factor, height: fittedHeight * factor };
}
+
/**
 * Compute where the zoomed image sits inside the scrollable canvas.
 *
 * The canvas is at least as large as the stage and at least as large as the
 * padded image. When the canvas is larger than the padded image, the image is
 * centered in the leftover space.
 */
function imagePlacement() {
  const image = scaledImageSize();
  const doublePadding = IMAGE_STAGE_PADDING * 2;

  const viewportWidth = Math.max(1, els.imageStage.clientWidth);
  const viewportHeight = Math.max(1, els.imageStage.clientHeight);
  const footprintWidth = image.width + doublePadding;
  const footprintHeight = image.height + doublePadding;

  const canvasWidth = Math.max(viewportWidth, footprintWidth);
  const canvasHeight = Math.max(viewportHeight, footprintHeight);
  const slackX = Math.max(0, (canvasWidth - footprintWidth) / 2);
  const slackY = Math.max(0, (canvasHeight - footprintHeight) / 2);

  return {
    width: image.width,
    height: image.height,
    canvasWidth,
    canvasHeight,
    left: IMAGE_STAGE_PADDING + slackX,
    top: IMAGE_STAGE_PADDING + slackY,
  };
}
+
/**
 * Center of the loaded item's focus box as fractions of the image size, or
 * `null` when the item has no usable table focus.
 */
function tableFocusPoint() {
  if (!hasTableFocus()) {
    return null;
  }
  const [x0, y0, x1, y1] = state.item.focus_bbox;
  const centerX = (x0 + x1) / 2;
  const centerY = (y0 + y1) / 2;
  return { x: centerX / DET_COORD_MAX, y: centerY / DET_COORD_MAX };
}
+
/**
 * Image-relative point (fractions of width/height) currently at the center of
 * the visible stage; the image center is returned until the image has loaded.
 */
function viewportCenterPoint() {
  const imageReady = els.rawImage.naturalWidth && els.rawImage.naturalHeight;
  if (!imageReady) {
    return { x: 0.5, y: 0.5 };
  }
  const placement = imagePlacement();
  const stage = els.imageStage;
  const centerX = stage.scrollLeft + (stage.clientWidth / 2);
  const centerY = stage.scrollTop + (stage.clientHeight / 2);
  return {
    x: (centerX - placement.left) / placement.width,
    y: (centerY - placement.top) / placement.height,
  };
}
+
/**
 * Scroll the stage so the image-relative `point` sits at the viewport center.
 * Does nothing without a point or before the image has loaded.
 */
function centerViewportOnPoint(point) {
  const imageReady = els.rawImage.naturalWidth && els.rawImage.naturalHeight;
  if (!point || !imageReady) return;

  const placement = imagePlacement();
  const stage = els.imageStage;
  const targetX = placement.left + (point.x * placement.width);
  const targetY = placement.top + (point.y * placement.height);

  // Scroll offsets are never set below zero.
  stage.scrollLeft = Math.max(0, targetX - (stage.clientWidth / 2));
  stage.scrollTop = Math.max(0, targetY - (stage.clientHeight / 2));
}
+
/**
 * Bounding boxes to outline for the loaded item: its non-empty `focus_bboxes`
 * list if present, else its single four-element `focus_bbox`, else nothing.
 */
function tableBoxes() {
  const item = state.item;
  const many = item?.focus_bboxes;
  if (Array.isArray(many) && many.length > 0) {
    return many;
  }
  const single = item?.focus_bbox;
  const hasSingle = Array.isArray(single) && single.length === 4;
  return hasSingle ? [single] : [];
}
+
/**
 * Convert a `[left, top, right, bottom]` box in detector coordinates into a
 * canvas-pixel rectangle for the current placement (at least 1×1 px).
 */
function bboxToDisplayRect(bbox) {
  const [x0, y0, x1, y1] = [bbox[0], bbox[1], bbox[2], bbox[3]];
  const placement = imagePlacement();
  const rectWidth = ((x1 - x0) / DET_COORD_MAX) * placement.width;
  const rectHeight = ((y1 - y0) / DET_COORD_MAX) * placement.height;
  return {
    left: placement.left + (x0 / DET_COORD_MAX) * placement.width,
    top: placement.top + (y0 / DET_COORD_MAX) * placement.height,
    width: Math.max(1, rectWidth),
    height: Math.max(1, rectHeight),
  };
}
+
/** Remove every focus box and hide the overlay layer. */
function clearFocusOverlay() {
  const overlay = els.imageOverlay;
  overlay.replaceChildren();
  overlay.hidden = true;
}
+
/**
 * Draw one `.focus-box` element per table box over the raw image. The overlay
 * is cleared instead when the item has no table focus, the image has not
 * loaded, or the image is hidden.
 */
function renderFocusOverlay() {
  const image = els.rawImage;
  const canDraw = hasTableFocus() && image.naturalWidth && image.naturalHeight && !image.hidden;
  if (!canDraw) {
    clearFocusOverlay();
    return;
  }

  const outlines = [];
  for (const bbox of tableBoxes()) {
    const rect = bboxToDisplayRect(bbox);
    const outline = document.createElement('div');
    outline.className = 'focus-box';
    Object.assign(outline.style, {
      left: `${rect.left}px`,
      top: `${rect.top}px`,
      width: `${rect.width}px`,
      height: `${rect.height}px`,
    });
    outlines.push(outline);
  }

  els.imageOverlay.replaceChildren(...outlines);
  els.imageOverlay.hidden = false;
}
+
/** Apply zoom `value`, then re-center the viewport on `point` on the next frame. */
function zoomAroundPoint(value, point) {
  setZoom(value);
  const recenter = () => centerViewportOnPoint(point);
  scheduleAfterLayout(recenter);
}
+
/**
 * Change the zoom by `delta`, anchored on the table focus point when the item
 * has one and on the current viewport center otherwise.
 */
function adjustZoom(delta) {
  const requestedZoom = state.zoom + delta;
  // The anchor is measured before the zoom changes the layout.
  const anchor = tableFocusPoint() || viewportCenterPoint();
  zoomAroundPoint(requestedZoom, anchor);
}
+
/** Limit `value` to the range `min`..`max`; `max` wins if the bounds cross. */
function clamp(value, min, max) {
  const lowerBounded = Math.max(min, value);
  return Math.min(max, lowerBounded);
}
+
/**
 * Bring the loaded item's table box into view.
 *
 * With `resetZoom` (the default) the zoom is recomputed so the slightly
 * enlarged box fits the stage, then the view is centered on the box. Without
 * it only the centering happens. Items without a usable focus box, or an
 * image that has not loaded, just get zoom 1 when `resetZoom` is set.
 */
function focusCurrentItem({ resetZoom = true } = {}) {
  const imageReady = els.rawImage.naturalWidth && els.rawImage.naturalHeight;
  if (!hasTableFocus() || !imageReady) {
    if (resetZoom) setZoom(1);
    return;
  }

  const [x0, y0, x1, y1] = state.item.focus_bbox;
  const smallestRatio = 1 / DET_COORD_MAX;
  const widthRatio = Math.max(smallestRatio, (x1 - x0) / DET_COORD_MAX);
  const heightRatio = Math.max(smallestRatio, (y1 - y0) / DET_COORD_MAX);
  const fitted = baseFittedImageSize();
  const center = tableFocusPoint();

  if (!resetZoom) {
    scheduleAfterLayout(() => centerViewportOnPoint(center));
    return;
  }

  // Space available for the box, and the box size at zoom 1 with overscan.
  const roomX = Math.max(180, els.imageStage.clientWidth - FOCUS_VIEWPORT_MARGIN);
  const roomY = Math.max(180, els.imageStage.clientHeight - FOCUS_VIEWPORT_MARGIN);
  const boxWidth = Math.max(24, widthRatio * fitted.width * FOCUS_BOX_OVERSCAN_X);
  const boxHeight = Math.max(24, heightRatio * fitted.height * FOCUS_BOX_OVERSCAN_Y);

  // The tighter of the two axes decides the zoom.
  const fitZoom = Math.min(roomX / boxWidth, roomY / boxHeight);
  zoomAroundPoint(clamp(fitZoom, 0.35, 3), center);
}
+
/** Fetch the session's progress payload, refresh the header, and return it. */
async function loadProgress() {
  const progressUrl = `/api/session/${state.sessionId}`;
  const latest = await apiJson(progressUrl);
  updateProgress(latest);
  return latest;
}
+
/** Start loading `url` into an off-screen Image kept on `state.prefetchImage`. */
function prefetchNextImage(url) {
  if (!url) {
    return;
  }
  const warmup = new Image();
  state.prefetchImage = warmup;
  warmup.decoding = 'async';
  warmup.src = url;
}
+
/** Scroll both extracted-content panes (rendered and raw) back to the top. */
function resetExtractedContentScroll() {
  for (const pane of [els.markdownPreview, els.rawMarkdown]) {
    pane.scrollTop = 0;
  }
}
+
/**
 * Load queue item `index` (limited to the known queue range) and render it:
 * metadata fields, extracted content, raw page image, stored annotation and
 * navigation button state.
 *
 * Rejects if the item request fails. Optional manifest fields
 * (`mapping_warnings`, `page_text_sha256`) are tolerated when missing.
 */
async function loadItem(index) {
  const lastIndex = Math.max(0, state.itemCount - 1);
  const safeIndex = Math.max(0, Math.min(index, lastIndex));
  const inlineFlag = state.showInlineImages ? '1' : '0';
  const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}?inline_images=${inlineFlag}`);
  const item = data.item;

  state.index = safeIndex;
  state.item = item;
  state.itemCount = data.item_count;
  state.startedAt = new Date(); // review timer starts when the item is shown

  // Spreading a missing warnings list or slicing a missing hash would throw,
  // so both are normalized first.
  const mappingWarnings = Array.isArray(item.mapping_warnings) ? item.mapping_warnings : [];
  const textHash = String(item.page_text_sha256 ?? '');

  els.reportName.textContent = item.report_name;
  els.industryValue.textContent = item.industry_slug;
  els.tickerValue.textContent = `${item.exchange}:${item.ticker} · ${item.year}`;
  els.pageValue.textContent = item.item_kind === 'table'
    ? `${item.page_number} / ${item.mmd_page_count} · Table ${(item.table_index ?? 0) + 1}`
    : `${item.page_number} / ${item.mmd_page_count}`;
  els.signalsValue.textContent = formatList(item.candidate_reasons);
  els.mappingValue.textContent = [item.mapping_status, ...mappingWarnings].filter(Boolean).join(' · ');
  els.imageSubtitle.textContent = item.raw_png_path || 'No raw image path';
  els.markdownSubtitle.textContent = `${item.page_text_chars} chars · ${textHash.slice(0, 12)}`;

  // `markdown_html` comes from the server's item endpoint.
  els.markdownPreview.innerHTML = data.markdown_html || '';
  els.rawMarkdown.textContent = data.page_text || '';
  resetExtractedContentScroll();
  clearFocusOverlay();

  if (item.raw_png_path) {
    els.rawImage.hidden = false;
    els.imageMissing.hidden = true;
    // The text hash is appended as a query parameter to the image URL.
    els.rawImage.src = `${data.image_url}?v=${encodeURIComponent(textHash)}`;
    prefetchNextImage(data.next_image_url);
  } else {
    els.rawImage.hidden = true;
    els.imageMissing.hidden = false;
    els.rawImage.removeAttribute('src');
    setZoom(1);
    clearFocusOverlay();
  }

  loadAnnotation(data.annotation);
  statusMessage(`Loaded item ${safeIndex + 1} of ${data.item_count}`);
  els.prevButton.disabled = safeIndex === 0;
  els.nextButton.disabled = safeIndex >= data.item_count - 1;
}
+
+/**
+ * Build the JSON body for the annotation endpoint from the current UI state.
+ *
+ * The duration and the "updated" timestamp come from one clock reading, so
+ * started + duration always equals updated. The duration is clamped at zero in
+ * case the system clock moved backwards during the review.
+ *
+ * @param {string} [source='manual'] What triggered the save.
+ * @returns {object} Payload with item id, status, notes and timing fields.
+ */
+function annotationPayload(source = 'manual') {
+  const now = new Date();
+  const startedAt = state.startedAt;
+  return {
+    item_id: state.item.item_id,
+    overall_status: state.overallStatus,
+    notes: els.notesInput.value,
+    annotation_source: source,
+    review_duration_ms: startedAt ? Math.max(0, now - startedAt) : null,
+    client_started_at_utc: startedAt ? startedAt.toISOString() : null,
+    client_updated_at_utc: now.toISOString(),
+  };
+}
+
+/**
+ * Persist the current annotation and optionally advance to the next item.
+ *
+ * Does nothing when no item is loaded or a save is already in flight. Errors
+ * are reported through statusMessage() and never thrown; a failure that
+ * happens after the save succeeded (while loading the next item) is reported
+ * as a navigation problem rather than as a failed save.
+ *
+ * @param {string} [source='manual'] Recorded as `annotation_source`.
+ * @param {boolean} [advance=false] Load the next queue item after saving.
+ * @returns {Promise<void>}
+ */
+async function saveAnnotation(source = 'manual', advance = false) {
+  if (!state.item || state.saving) return;
+  state.saving = true;
+  els.saveButton.disabled = true;
+  statusMessage('Saving...');
+  let saved = false;
+  try {
+    const data = await apiJson(`/api/session/${state.sessionId}/annotation`, {
+      method: 'POST',
+      body: JSON.stringify(annotationPayload(source)),
+    });
+    saved = true;
+    updateProgress(data.progress);
+    statusMessage('Saved', 'ok');
+    if (advance && state.index < state.itemCount - 1) {
+      await loadItem(state.index + 1);
+      await loadProgress();
+    }
+  } catch (error) {
+    const prefix = saved ? 'Saved, but loading the next item failed' : 'Save failed';
+    statusMessage(`${prefix}: ${error.message}`, 'error');
+  } finally {
+    state.saving = false;
+    els.saveButton.disabled = false;
+  }
+}
+
+/**
+ * Set the overall status and immediately save-and-advance.
+ *
+ * Ignored while no item is loaded or another save is in flight. In those
+ * cases saveAnnotation() would drop the request, so the visible status is
+ * left untouched rather than showing a value that was never persisted.
+ *
+ * @param {string} status Overall status to record.
+ * @param {string} [source='shortcut'] Origin tag; stored as `${source}:${status}`.
+ * @returns {Promise<void>|undefined} The save promise, or undefined when ignored.
+ */
+function quickMark(status, source = 'shortcut') {
+  if (!state.item || state.saving) return undefined;
+  setOverall(status);
+  return saveAnnotation(`${source}:${status}`, true);
+}
+
+/**
+ * Move `delta` items forward or backward in the queue and refresh progress.
+ *
+ * Out-of-range targets are ignored. Failures are shown in the status line
+ * instead of surfacing as unhandled promise rejections from event handlers.
+ *
+ * @param {number} delta Signed step, e.g. 1 or -1.
+ * @returns {Promise<void>}
+ */
+async function go(delta) {
+  const target = state.index + delta;
+  if (target < 0 || target >= state.itemCount) return;
+  try {
+    await loadItem(target);
+    await loadProgress();
+  } catch (error) {
+    statusMessage(`Navigation failed: ${error.message}`, 'error');
+  }
+}
+
+/**
+ * Jump to the next unreviewed item as reported by the progress endpoint.
+ *
+ * Shows "No open items" when the server reports none. Failures are shown in
+ * the status line instead of becoming unhandled promise rejections.
+ *
+ * @returns {Promise<void>}
+ */
+async function goNextOpen() {
+  try {
+    const progress = await loadProgress();
+    const nextIndex = progress.next_unreviewed_index;
+    if (nextIndex === null || nextIndex === undefined) {
+      statusMessage('No open items');
+      return;
+    }
+    await loadItem(nextIndex);
+  } catch (error) {
+    statusMessage(`Navigation failed: ${error.message}`, 'error');
+  }
+}
+
+/** Flip the extracted-content pane between rendered HTML and raw markdown. */
+function toggleRawMarkdown() {
+  const showRaw = !state.showingRaw;
+  state.showingRaw = showRaw;
+  els.markdownPreview.hidden = showRaw;
+  els.rawMarkdown.hidden = !showRaw;
+  // The button always names the view the next click switches to.
+  els.toggleRawButton.textContent = showRaw ? 'Rendered' : 'Raw Markdown';
+}
+
+/**
+ * Report whether keyboard focus is inside a text-entry control, so global
+ * shortcuts can stay out of the way while the annotator is typing.
+ *
+ * Covers form controls and contentEditable hosts.
+ *
+ * @returns {boolean} Always a real boolean, also when nothing is focused.
+ */
+function inputHasFocus() {
+  const active = document.activeElement;
+  if (!active) return false;
+  return ['TEXTAREA', 'INPUT', 'SELECT'].includes(active.tagName) || active.isContentEditable === true;
+}
+
+/**
+ * Wire every toolbar button, the resize handler and the global keyboard
+ * shortcuts.
+ *
+ * Shortcuts are skipped when
+ *  - Ctrl, Meta or Alt is held, so browser chords such as Ctrl+R, Ctrl+F or
+ *    Ctrl+0 keep working and never mark an item by accident,
+ *  - a text-entry control has focus, or
+ *  - the help dialog is open.
+ */
+function setupEvents() {
+  const refocus = () => focusCurrentItem({ resetZoom: true });
+  const showHelp = () => els.helpDialog.showModal();
+
+  els.prevButton.addEventListener('click', () => go(-1));
+  els.nextButton.addEventListener('click', () => go(1));
+  els.skipReviewedButton.addEventListener('click', goNextOpen);
+  els.saveButton.addEventListener('click', () => saveAnnotation('manual', false));
+  els.inlineImagesToggle.addEventListener('change', () => {
+    state.showInlineImages = els.inlineImagesToggle.checked;
+    // loadItem() rejects on API errors; report them instead of leaving the
+    // rejection unhandled.
+    loadItem(state.index).catch((error) => {
+      statusMessage(`Reload failed: ${error.message}`, 'error');
+    });
+  });
+  els.toggleRawButton.addEventListener('click', toggleRawMarkdown);
+  els.zoomOutButton.addEventListener('click', () => adjustZoom(-0.15));
+  els.zoomInButton.addEventListener('click', () => adjustZoom(0.15));
+  els.zoomResetButton.addEventListener('click', refocus);
+  els.refocusButton.addEventListener('click', refocus);
+  els.helpButton.addEventListener('click', showHelp);
+  els.rawImage.addEventListener('load', refocus);
+  window.addEventListener('resize', () => {
+    // Capture the anchor before re-applying zoom so the same point stays centred.
+    const anchorPoint = tableFocusPoint() || viewportCenterPoint();
+    applyZoom();
+    scheduleAfterLayout(() => centerViewportOnPoint(anchorPoint));
+  });
+  document.querySelectorAll('.status-button').forEach((button) => {
+    button.addEventListener('click', () => quickMark(button.dataset.status, 'button'));
+  });
+
+  document.addEventListener('keydown', (event) => {
+    if (event.ctrlKey || event.metaKey || event.altKey) return;
+    if (inputHasFocus() || els.helpDialog.open) return;
+
+    const key = event.key;
+    const letter = key.toLowerCase();
+    let action = null;
+    if (key === '?') {
+      action = showHelp;
+    } else if (letter === 'a') {
+      action = () => quickMark('ok');
+    } else if (letter === 'r') {
+      action = () => quickMark('not_ok');
+    } else if (letter === 'u') {
+      action = () => quickMark('uncertain');
+    } else if (key === 'ArrowRight' || letter === 'j') {
+      action = () => go(1);
+    } else if (key === 'ArrowLeft' || letter === 'k') {
+      action = () => go(-1);
+    } else if (key === '+' || key === '=') {
+      action = () => adjustZoom(0.15);
+    } else if (key === '-') {
+      action = () => adjustZoom(-0.15);
+    } else if (key === '0' || letter === 'f') {
+      action = refocus;
+    }
+    if (!action) return;
+
+    event.preventDefault();
+    action();
+  });
+}
+
+/**
+ * Page bootstrap: attach the event handlers, then open the first unreviewed
+ * item (or item 0). Startup problems are shown in the status line.
+ */
+async function init() {
+  setupEvents();
+  try {
+    const progress = await loadProgress();
+    if (!(progress.item_count > 0)) {
+      statusMessage('Session has no queued items', 'error');
+      return;
+    }
+    await loadItem(progress.next_unreviewed_index ?? 0);
+  } catch (error) {
+    statusMessage(`Startup failed: ${error.message}`, 'error');
+  }
+}
+
+init();
\ No newline at end of file
diff --git a/annotation_OCR/static/style.css b/annotation_OCR/static/style.css
new file mode 100644
index 0000000..24f39fe
--- /dev/null
+++ b/annotation_OCR/static/style.css
@@ -0,0 +1,522 @@
+:root {
+ --bg: #edf1f2;
+ --panel: #fbfcfa;
+ --panel-2: #f5f7f4;
+ --ink: #1d2528;
+ --muted: #5b686d;
+ --line: #cdd7d8;
+ --teal: #08746f;
+ --teal-dark: #075854;
+ --red: #aa3d2d;
+ --amber: #a06010;
+ --green: #2d7434;
+ --shadow: 0 18px 45px rgba(31, 45, 49, 0.14);
+ --mono: "JetBrains Mono", "IBM Plex Mono", "Cascadia Mono", monospace;
+ --sans: "Aptos", "Source Sans 3", "Segoe UI", sans-serif;
+}
+
+* {
+ box-sizing: border-box;
+}
+
+body {
+ margin: 0;
+ min-height: 100vh;
+ background:
+ linear-gradient(135deg, rgba(8, 116, 111, 0.09), transparent 34%),
+ linear-gradient(315deg, rgba(170, 61, 45, 0.08), transparent 36%),
+ var(--bg);
+ color: var(--ink);
+ font-family: var(--sans);
+}
+
+button,
+select,
+textarea {
+ font: inherit;
+}
+
+/* Shared chrome for buttons and link-styled buttons. */
+button,
+.secondary-link {
+  border: 1px solid var(--line);
+  background: var(--panel);
+  color: var(--ink);
+  min-height: 36px;
+  padding: 0 12px;
+  border-radius: 6px;
+  cursor: pointer;
+  text-decoration: none;
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  white-space: nowrap;
+}
+
+/* The script disables prev/next at the queue ends and the save button while
+   a request is in flight. Dim those and drop pointer events so none of the
+   :hover rules (including .primary-button:hover) apply to them. */
+button:disabled {
+  opacity: 0.55;
+  pointer-events: none;
+}
+
+button:hover,
+.secondary-link:hover {
+  border-color: var(--teal);
+}
+
+/* Sticky header: session title, progress block and navigation actions. */
+.topbar {
+  position: sticky;
+  top: 0;
+  z-index: 20;
+  display: grid;
+  grid-template-columns: minmax(280px, 1fr) minmax(260px, 420px) auto;
+  gap: 18px;
+  align-items: center;
+  padding: 14px 18px;
+  background: rgba(251, 252, 250, 0.94);
+  border-bottom: 1px solid var(--line);
+  /* Prefixed form first: older Safari releases only understand it. */
+  -webkit-backdrop-filter: blur(14px);
+  backdrop-filter: blur(14px);
+}
+
+.eyebrow,
+.section-label {
+ color: var(--muted);
+ font-size: 11px;
+ font-weight: 700;
+ letter-spacing: 0;
+ text-transform: uppercase;
+}
+
+.session-title {
+ font-size: 18px;
+ font-weight: 800;
+}
+
+.session-meta,
+.pane-subtitle,
+.save-status {
+ color: var(--muted);
+ font-size: 12px;
+}
+
+.pane-subtitle {
+ overflow: hidden;
+ text-overflow: ellipsis;
+ white-space: nowrap;
+}
+
+.progress-block {
+ display: grid;
+ gap: 7px;
+}
+
+.progress-track {
+ width: 100%;
+ height: 8px;
+ overflow: hidden;
+ background: #dce3e4;
+ border-radius: 999px;
+}
+
+.progress-track div {
+ width: 0%;
+ height: 100%;
+ background: linear-gradient(90deg, var(--teal), #6a8d28);
+ transition: width 160ms ease;
+}
+
+.nav-actions,
+.zoom-actions,
+.panel-actions {
+ display: flex;
+ gap: 8px;
+ align-items: center;
+ flex: 0 0 auto;
+}
+
+.icon-button {
+ width: 36px;
+ padding: 0;
+ font-weight: 800;
+}
+
+.workspace {
+ display: grid;
+ grid-template-columns: minmax(280px, 1.05fr) minmax(280px, 1fr) minmax(280px, 340px);
+ gap: 14px;
+ padding: 14px;
+ height: calc(100vh - 82px);
+}
+
+.pane,
+.annotation-panel {
+ min-width: 0;
+ min-height: 0;
+ background: var(--panel);
+ border: 1px solid var(--line);
+ border-radius: 8px;
+ box-shadow: var(--shadow);
+}
+
+.pane {
+ display: grid;
+ grid-template-rows: auto minmax(0, 1fr);
+ overflow: hidden;
+}
+
+.pane-toolbar {
+ display: flex;
+ justify-content: space-between;
+ gap: 12px;
+ align-items: center;
+ min-width: 0;
+ overflow: hidden;
+ padding: 12px;
+ border-bottom: 1px solid var(--line);
+ background: var(--panel-2);
+}
+
+.pane-toolbar>div:first-child {
+ flex: 1 1 auto;
+ min-width: 0;
+}
+
+.zoom-actions {
+ margin-left: auto;
+}
+
+.preview-actions {
+ display: flex;
+ flex: 0 0 auto;
+ align-items: center;
+ gap: 10px;
+}
+
+.toggle-control {
+ display: inline-flex;
+ align-items: center;
+ gap: 6px;
+ color: var(--muted);
+ font-size: 12px;
+ white-space: nowrap;
+}
+
+.zoom-actions button {
+ width: 36px;
+ padding: 0;
+}
+
+.zoom-actions #zoomResetButton {
+ width: 58px;
+}
+
+.pane-title {
+ font-size: 15px;
+ font-weight: 800;
+}
+
+.image-stage {
+ position: relative;
+ overflow: auto;
+ display: block;
+ background:
+ linear-gradient(45deg, #dce3e4 25%, transparent 25%),
+ linear-gradient(-45deg, #dce3e4 25%, transparent 25%),
+ linear-gradient(45deg, transparent 75%, #dce3e4 75%),
+ linear-gradient(-45deg, transparent 75%, #dce3e4 75%);
+ background-size: 22px 22px;
+ background-position: 0 0, 0 11px, 11px -11px, -11px 0;
+}
+
+.image-canvas {
+ --image-width: 320px;
+ --image-height: 453px;
+ --image-left: 16px;
+ --image-top: 16px;
+ --canvas-width: 100%;
+ --canvas-height: 100%;
+ position: relative;
+ width: var(--canvas-width);
+ min-width: var(--canvas-width);
+ height: var(--canvas-height);
+ min-height: var(--canvas-height);
+}
+
+.image-overlay {
+ position: absolute;
+ inset: 0;
+ pointer-events: none;
+}
+
+.focus-box {
+ position: absolute;
+ border: 1px solid rgba(204, 20, 20, 0.95);
+ border-radius: 4px;
+ background: rgba(204, 20, 20, 0.06);
+ box-shadow:
+ 0 0 0 1px rgba(255, 255, 255, 0.8) inset,
+ 0 0 0 1px rgba(204, 20, 20, 0.35);
+}
+
+#rawImage {
+ position: absolute;
+ left: var(--image-left);
+ top: var(--image-top);
+ width: var(--image-width);
+ height: var(--image-height);
+ max-width: none;
+ border: 1px solid #b7c3c5;
+ background: white;
+ box-shadow: 0 12px 28px rgba(31, 45, 49, 0.18);
+}
+
+.missing-state {
+ margin: 40px auto;
+ padding: 20px;
+ border: 1px dashed var(--red);
+ background: #fff7f3;
+ color: var(--red);
+ border-radius: 8px;
+}
+
+.markdown-preview,
+.raw-markdown {
+ overflow: auto;
+ margin: 0;
+ padding: 18px;
+}
+
+.markdown-preview {
+ line-height: 1.48;
+}
+
+.markdown-preview h1,
+.markdown-preview h2,
+.markdown-preview h3 {
+ margin: 1.2em 0 0.45em;
+ line-height: 1.15;
+}
+
+.markdown-preview table {
+ width: max-content;
+ max-width: 100%;
+ border-collapse: collapse;
+ margin: 14px 0;
+ font-size: 13px;
+}
+
+.markdown-preview th,
+.markdown-preview td {
+ border: 1px solid #b9c4c6;
+ padding: 6px 8px;
+ vertical-align: top;
+}
+
+.markdown-preview th {
+ background: #e3eceb;
+}
+
+.markdown-preview img {
+ max-width: 100%;
+ height: auto;
+ border: 1px solid var(--line);
+}
+
+.raw-markdown {
+ font-family: var(--mono);
+ font-size: 12px;
+ line-height: 1.45;
+ white-space: pre-wrap;
+ background: #172225;
+ color: #e7eeed;
+}
+
+.annotation-panel {
+ display: flex;
+ flex-direction: column;
+ overflow: auto;
+ padding: 12px;
+ gap: 12px;
+}
+
+.panel-section {
+ display: grid;
+ gap: 10px;
+ padding-bottom: 12px;
+ border-bottom: 1px solid var(--line);
+}
+
+.report-card h1 {
+ margin: 0;
+ font-size: 18px;
+ line-height: 1.2;
+}
+
+dl {
+ display: grid;
+ gap: 7px;
+ margin: 0;
+}
+
+dl div {
+ display: grid;
+ grid-template-columns: 78px minmax(0, 1fr);
+ gap: 8px;
+}
+
+dt {
+ color: var(--muted);
+ font-size: 12px;
+}
+
+dd {
+ margin: 0;
+ min-width: 0;
+ overflow-wrap: anywhere;
+ font-size: 12px;
+}
+
+.decision-buttons {
+ display: grid;
+ grid-template-columns: repeat(3, 1fr);
+ gap: 8px;
+}
+
+.status-button[data-status="ok"].active {
+ background: var(--green);
+ border-color: var(--green);
+ color: white;
+}
+
+.status-button[data-status="not_ok"].active {
+ background: var(--red);
+ border-color: var(--red);
+ color: white;
+}
+
+.status-button[data-status="uncertain"].active {
+ background: var(--amber);
+ border-color: var(--amber);
+ color: white;
+}
+
+.subchecks-section label {
+ display: grid;
+ grid-template-columns: 1fr 140px;
+ gap: 8px;
+ align-items: center;
+ font-size: 13px;
+}
+
+select,
+textarea {
+ width: 100%;
+ border: 1px solid var(--line);
+ border-radius: 6px;
+ background: white;
+ color: var(--ink);
+}
+
+select {
+ min-height: 34px;
+}
+
+textarea {
+ resize: vertical;
+ padding: 8px;
+}
+
+.issue-grid {
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 8px;
+}
+
+.issue-grid label {
+ display: flex;
+ align-items: center;
+ gap: 6px;
+ font-size: 12px;
+}
+
+.primary-button {
+ background: var(--teal);
+ border-color: var(--teal);
+ color: white;
+ font-weight: 800;
+ flex: 1;
+}
+
+.primary-button:hover {
+ background: var(--teal-dark);
+ border-color: var(--teal-dark);
+}
+
+.save-status {
+ min-height: 20px;
+}
+
+.help-dialog {
+ width: min(520px, calc(100vw - 32px));
+ border: 1px solid var(--line);
+ border-radius: 8px;
+ box-shadow: var(--shadow);
+}
+
+.dialog-header {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ gap: 12px;
+}
+
+.dialog-header h2 {
+ margin: 0 0 12px;
+}
+
+.shortcut-grid {
+ display: grid;
+ grid-template-columns: 90px minmax(0, 1fr);
+ gap: 8px 14px;
+}
+
+.shortcut-grid span {
+ font-family: var(--mono);
+ font-weight: 800;
+}
+
+.shortcut-grid p {
+ margin: 0;
+}
+
+@media (max-width: 1180px) {
+ .topbar {
+ grid-template-columns: 1fr;
+ }
+
+ .workspace {
+ height: auto;
+ min-height: calc(100vh - 82px);
+ grid-template-columns: 1fr;
+ }
+
+ .pane {
+ min-height: 72vh;
+ }
+
+ .annotation-panel {
+ min-height: 0;
+ }
+}
+
+@media (max-width: 620px) {
+
+ .nav-actions,
+ .pane-toolbar,
+ .panel-actions {
+ flex-wrap: wrap;
+ }
+
+ .decision-buttons,
+ .issue-grid {
+ grid-template-columns: 1fr;
+ }
+
+ .subchecks-section label,
+ dl div {
+ grid-template-columns: 1fr;
+ }
+}
\ No newline at end of file
diff --git a/annotation_OCR/store.py b/annotation_OCR/store.py
new file mode 100644
index 0000000..0d85a6b
--- /dev/null
+++ b/annotation_OCR/store.py
@@ -0,0 +1,369 @@
+"""File-backed session storage for OCR annotation runs."""
+
+from __future__ import annotations
+
+import csv
+import json
+import re
+import uuid
+from collections import Counter
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+HERE = Path(__file__).resolve().parent
+SESSIONS_DIR = HERE / "sessions"
+SCHEMA_VERSION = "1.0"
+
+VALID_OVERALL_STATUS = {"ok", "not_ok", "uncertain", "unreviewed"}
+
+SUMMARY_FIELDS = [
+ "session_id",
+ "session_name",
+ "annotator",
+ "item_id",
+ "item_kind",
+ "industry_slug",
+ "report_name",
+ "exchange",
+ "ticker",
+ "year",
+ "page_index",
+ "page_number",
+ "table_index",
+ "table_row_count",
+ "table_col_count",
+ "overall_status",
+ "notes",
+ "updated_at_utc",
+ "annotation_source",
+ "review_duration_ms",
+ "mapping_status",
+ "mapping_warnings",
+ "candidate_reasons",
+ "page_text_sha256",
+ "raw_png_path",
+ "mmd_path",
+ "det_mmd_path",
+ "focus_bbox",
+]
+
+
+def utc_now() -> str:
+    """Return the current UTC time as an ISO-8601 string with second precision."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat(timespec="seconds")
+
+
+def session_slug(value: str) -> str:
+    """Turn a free-form session name into a filesystem-friendly slug.
+
+    Runs of characters outside ``[A-Za-z0-9_.-]`` collapse to a single ``-``,
+    leading/trailing separators are stripped and the result is capped at 48
+    characters. Falls back to ``"session"`` when nothing usable remains.
+    """
+    slug = re.sub(r"[^A-Za-z0-9_.-]+", "-", value.strip()).strip("-._")
+    # Cutting at 48 characters can expose a separator at the end, so trim again.
+    return slug[:48].rstrip("-._") or "session"
+
+
+def new_session_id(session_name: str | None = None) -> str:
+    """Build a session id of the form ``<slug prefix>-<12 hex chars>``.
+
+    The prefix is the slugged session name capped at 24 characters. Separators
+    exposed by that cut are trimmed so the id never contains ``--`` at the join.
+    """
+    prefix = session_slug(session_name or "session")[:24].rstrip("-._") or "session"
+    return f"{prefix}-{uuid.uuid4().hex[:12]}"
+
+
+def atomic_write_text(path: Path, text: str) -> None:
+    """Write ``text`` to ``path`` as UTF-8 via a temp file plus rename.
+
+    Parent directories are created as needed. The temp file gets a unique name
+    so two writers targeting the same path cannot clobber each other's staging
+    file, and it is removed again if writing or renaming fails.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
+    try:
+        tmp.write_text(text, encoding="utf-8")
+        tmp.replace(path)
+    except BaseException:
+        # Do not leave half-written staging files behind.
+        tmp.unlink(missing_ok=True)
+        raise
+
+
+def atomic_write_json(path: Path, payload: Any) -> None:
+    """Serialise ``payload`` as indented, non-ASCII-preserving JSON and write it atomically."""
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    atomic_write_text(path, serialized)
+
+
+def session_dir(session_id: str) -> Path:
+    """Directory that holds every artifact of one session."""
+    # NOTE(review): session_id is joined as-is; if ids can arrive from request
+    # URLs, confirm the caller rejects path separators and "..".
+    return SESSIONS_DIR.joinpath(session_id)
+
+
+def metadata_path(session_id: str) -> Path:
+    """Path of the session's ``metadata.json``."""
+    return session_dir(session_id).joinpath("metadata.json")
+
+
+def manifest_path(session_id: str) -> Path:
+    """Path of the session's ``manifest.json``."""
+    return session_dir(session_id).joinpath("manifest.json")
+
+
+def current_annotations_path(session_id: str) -> Path:
+    """Path of the session's ``current_annotations.json``."""
+    return session_dir(session_id).joinpath("current_annotations.json")
+
+
+def annotations_log_path(session_id: str) -> Path:
+    """Path of the session's ``annotations.jsonl`` log."""
+    return session_dir(session_id).joinpath("annotations.jsonl")
+
+
+def create_session(
+    *,
+    session_name: str,
+    annotator: str,
+    manifest_items: list[dict[str, Any]],
+    index_summary: dict[str, Any],
+    config: dict[str, Any],
+    session_id: str | None = None,
+) -> dict[str, Any]:
+    """Create the on-disk layout for a new session and return its metadata.
+
+    Writes ``metadata.json``, ``manifest.json``, an empty
+    ``current_annotations.json``, an empty ``annotations.jsonl`` and the
+    summary files, in that order.
+
+    Raises:
+        FileExistsError: a directory for the chosen session id already exists.
+    """
+    sid = session_id if session_id else new_session_id(session_name)
+    target = session_dir(sid)
+    if target.exists():
+        raise FileExistsError(f"session already exists: {sid}")
+    target.mkdir(parents=True, exist_ok=False)
+
+    created = utc_now()
+    item_total = len(manifest_items)
+
+    # Key order is kept as-is because it determines the layout of the JSON files.
+    metadata = {
+        "schema_version": SCHEMA_VERSION,
+        "session_id": sid,
+        "session_name": session_name,
+        "annotator": annotator,
+        "created_at_utc": created,
+        "updated_at_utc": created,
+        "status": "active",
+        "item_count": item_total,
+        "completed_count": 0,
+        "index_summary": index_summary,
+        "config": config,
+    }
+    atomic_write_json(metadata_path(sid), metadata)
+
+    atomic_write_json(
+        manifest_path(sid),
+        {
+            "schema_version": SCHEMA_VERSION,
+            "session_id": sid,
+            "created_at_utc": created,
+            "item_count": item_total,
+            "items": manifest_items,
+        },
+    )
+    atomic_write_json(current_annotations_path(sid), {})
+    annotations_log_path(sid).touch()
+    write_summary_files(sid)
+    return metadata
+
+
+def load_json(path: Path, default: Any | None = None) -> Any:
+    """Parse the UTF-8 JSON file at ``path``; return ``default`` when it is not a file."""
+    if path.is_file():
+        return json.loads(path.read_text(encoding="utf-8"))
+    return default
+
+
+def load_metadata(session_id: str) -> dict[str, Any]:
+    """Return the session's metadata record.
+
+    Raises:
+        FileNotFoundError: ``metadata.json`` is absent (or contains JSON null).
+    """
+    loaded = load_json(metadata_path(session_id))
+    if loaded is not None:
+        return loaded
+    raise FileNotFoundError(f"unknown session: {session_id}")
+
+
+def load_manifest(session_id: str) -> list[dict[str, Any]]:
+    """Return the ``items`` list of the session manifest (empty when the key is absent).
+
+    Raises:
+        FileNotFoundError: ``manifest.json`` is absent (or contains JSON null).
+    """
+    loaded = load_json(manifest_path(session_id))
+    if loaded is not None:
+        return loaded.get("items", [])
+    raise FileNotFoundError(f"unknown session manifest: {session_id}")
+
+
+def load_current_annotations(session_id: str) -> dict[str, dict[str, Any]]:
+    """Return the latest annotation per item id; ``{}`` when the file is missing or empty."""
+    stored = load_json(current_annotations_path(session_id), default={})
+    return stored if stored else {}
+
+
+def list_sessions() -> list[dict[str, Any]]:
+    """Return the metadata of every session directory, most recently updated first.
+
+    Directories without a dict-shaped ``metadata.json`` are skipped; a missing
+    sessions root yields an empty list.
+    """
+    if not SESSIONS_DIR.is_dir():
+        return []
+    loaded = (
+        load_json(entry / "metadata.json")
+        for entry in sorted(SESSIONS_DIR.iterdir())
+        if entry.is_dir()
+    )
+    records = [meta for meta in loaded if isinstance(meta, dict)]
+    return sorted(
+        records, key=lambda rec: rec.get("updated_at_utc", ""), reverse=True
+    )
+
+
+def manifest_index(session_id: str) -> dict[str, dict[str, Any]]:
+    """Map ``item_id`` to manifest item for one session (later duplicates win)."""
+    by_id: dict[str, dict[str, Any]] = {}
+    for entry in load_manifest(session_id):
+        by_id[entry["item_id"]] = entry
+    return by_id
+
+
+def sanitize_status(value: Any, valid: set[str], default: str) -> str:
+    """Return ``value`` when it is a string contained in ``valid``, else ``default``."""
+    accepted = isinstance(value, str) and value in valid
+    return value if accepted else default
+
+
+def normalize_annotation_payload(payload: dict[str, Any]) -> dict[str, Any]:
+    """Reduce a client payload to the annotation fields that get persisted.
+
+    * ``overall_status`` falls back to ``"unreviewed"`` when not a known status.
+    * ``notes`` is stringified and stripped; ``annotation_source`` defaults to
+      ``"manual"``.
+    * ``review_duration_ms`` is kept only when it is a finite, non-negative
+      number (stored as ``int``); anything else becomes ``None`` so the log and
+      the summary CSV never carry arbitrary client data in that column.
+    * The client timestamps are passed through unchanged.
+    """
+    duration = payload.get("review_duration_ms")
+    usable = (
+        isinstance(duration, (int, float))
+        and not isinstance(duration, bool)
+        and duration == duration  # rejects NaN
+        and duration not in (float("inf"), float("-inf"))
+        and duration >= 0
+    )
+    return {
+        "overall_status": sanitize_status(
+            payload.get("overall_status"), VALID_OVERALL_STATUS, "unreviewed"
+        ),
+        "notes": str(payload.get("notes") or "").strip(),
+        "annotation_source": str(payload.get("annotation_source") or "manual"),
+        "review_duration_ms": int(duration) if usable else None,
+        "client_started_at_utc": payload.get("client_started_at_utc"),
+        "client_updated_at_utc": payload.get("client_updated_at_utc"),
+    }
+
+
+def next_log_sequence(path: Path) -> int:
+    """Return 1 + the number of non-blank lines in the log at ``path`` (1 if it is not a file)."""
+    if not path.is_file():
+        return 1
+    entries = 0
+    with path.open(encoding="utf-8") as handle:
+        for line in handle:
+            if line.strip():
+                entries += 1
+    return entries + 1
+
+
+def save_annotation(
+    *,
+    session_id: str,
+    item_id: str,
+    payload: dict[str, Any],
+) -> dict[str, Any]:
+    """Record one annotation event for ``item_id`` and refresh derived files.
+
+    Steps, in order: append the record to ``annotations.jsonl``, replace the
+    item's entry in ``current_annotations.json``, update the counters in
+    ``metadata.json`` and regenerate the summary files.
+
+    Returns:
+        The full record that was written (manifest fields + normalized payload).
+
+    Raises:
+        FileNotFoundError: the session metadata or manifest cannot be loaded.
+        KeyError: ``item_id`` is not part of the session manifest.
+    """
+    metadata = load_metadata(session_id)
+    items = manifest_index(session_id)
+    item = items.get(item_id)
+    if item is None:
+        raise KeyError(f"item not in session manifest: {item_id}")
+
+    normalized = normalize_annotation_payload(payload)
+    now = utc_now()
+    log_path = annotations_log_path(session_id)
+    # Manifest fields are copied into the record so each log line is
+    # self-describing; the normalized payload is merged last.
+    record = {
+        "schema_version": SCHEMA_VERSION,
+        "sequence": next_log_sequence(log_path),
+        "session_id": session_id,
+        "session_name": metadata.get("session_name"),
+        "annotator": metadata.get("annotator"),
+        "created_at_utc": now,
+        "updated_at_utc": now,
+        "item_id": item_id,
+        "item_kind": item.get("item_kind", "page"),
+        "industry_slug": item.get("industry_slug"),
+        "report_name": item.get("report_name"),
+        "exchange": item.get("exchange"),
+        "ticker": item.get("ticker"),
+        "year": item.get("year"),
+        "page_index": item.get("page_index"),
+        "page_number": item.get("page_number"),
+        "table_index": item.get("table_index"),
+        "table_row_count": item.get("table_row_count"),
+        "table_col_count": item.get("table_col_count"),
+        "mmd_path": item.get("mmd_path"),
+        "det_mmd_path": item.get("det_mmd_path"),
+        "raw_png_path": item.get("raw_png_path"),
+        "focus_bbox": item.get("focus_bbox"),
+        "mapping_status": item.get("mapping_status"),
+        "mapping_warnings": item.get("mapping_warnings", []),
+        "candidate_reasons": item.get("candidate_reasons", []),
+        "page_text_sha256": item.get("page_text_sha256"),
+        **normalized,
+    }
+
+    # 1) Append-only history: one JSON object per line.
+    with log_path.open("a", encoding="utf-8") as handle:
+        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+    # 2) Latest state per item: read, replace this item's entry, write back.
+    # NOTE(review): no locking is visible here; two saves on the same session
+    # at once could overwrite each other's entry. Confirm the server
+    # serialises saves per session.
+    current = load_current_annotations(session_id)
+    current[item_id] = record
+    atomic_write_json(current_annotations_path(session_id), current)
+
+    # 3) Counters: an item counts as completed once its status is not "unreviewed".
+    completed_count = sum(
+        1 for rec in current.values() if rec.get("overall_status") != "unreviewed"
+    )
+    metadata["updated_at_utc"] = now
+    metadata["completed_count"] = completed_count
+    metadata["item_count"] = len(items)
+    atomic_write_json(metadata_path(session_id), metadata)
+    # 4) Derived CSV/Markdown summaries are rebuilt on every save.
+    write_summary_files(session_id)
+    return record
+
+
+def summary_rows(session_id: str) -> list[dict[str, Any]]:
+    """Build one flat summary row per manifest item, joined with its latest annotation.
+
+    Items without an annotation get ``overall_status == "unreviewed"`` and
+    empty annotation columns. List-valued manifest fields are joined with
+    ``;``. A null or missing list is treated as empty and every entry is
+    stringified, so a malformed manifest value cannot abort the summary.
+    ``focus_bbox`` is emitted as a JSON string.
+    """
+
+    def joined(values: Any) -> str:
+        # ``or []`` also covers keys that are present but hold null.
+        return ";".join(str(entry) for entry in (values or []))
+
+    metadata = load_metadata(session_id)
+    current = load_current_annotations(session_id)
+    rows: list[dict[str, Any]] = []
+    for item in load_manifest(session_id):
+        annotation = current.get(item["item_id"], {})
+        rows.append(
+            {
+                "session_id": session_id,
+                "session_name": metadata.get("session_name", ""),
+                "annotator": metadata.get("annotator", ""),
+                "item_id": item.get("item_id"),
+                "item_kind": item.get("item_kind", "page"),
+                "industry_slug": item.get("industry_slug"),
+                "report_name": item.get("report_name"),
+                "exchange": item.get("exchange"),
+                "ticker": item.get("ticker"),
+                "year": item.get("year"),
+                "page_index": item.get("page_index"),
+                "page_number": item.get("page_number"),
+                "table_index": item.get("table_index"),
+                "table_row_count": item.get("table_row_count"),
+                "table_col_count": item.get("table_col_count"),
+                "overall_status": annotation.get("overall_status", "unreviewed"),
+                "notes": annotation.get("notes", ""),
+                "updated_at_utc": annotation.get("updated_at_utc", ""),
+                "annotation_source": annotation.get("annotation_source", ""),
+                "review_duration_ms": annotation.get("review_duration_ms", ""),
+                "mapping_status": item.get("mapping_status"),
+                "mapping_warnings": joined(item.get("mapping_warnings")),
+                "candidate_reasons": joined(item.get("candidate_reasons")),
+                "page_text_sha256": item.get("page_text_sha256"),
+                "raw_png_path": item.get("raw_png_path"),
+                "mmd_path": item.get("mmd_path"),
+                "det_mmd_path": item.get("det_mmd_path"),
+                "focus_bbox": json.dumps(item.get("focus_bbox")),
+            }
+        )
+    return rows
+
+
+def write_summary_csv(path: Path, rows: list[dict[str, Any]]) -> None:
+    """Write ``rows`` as CSV with the ``SUMMARY_FIELDS`` header, via temp file and rename.
+
+    Keys not listed in ``SUMMARY_FIELDS`` are ignored.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    staging = path.with_suffix(path.suffix + ".tmp")
+    with staging.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(
+            handle, fieldnames=SUMMARY_FIELDS, extrasaction="ignore"
+        )
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+    staging.replace(path)
+
+
+def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None:
+    """Write a Markdown overview (header facts + per-status counts) to ``path``.
+
+    The session id is taken from the name of ``path``'s parent directory, so
+    ``path`` is expected to live directly inside the session folder.
+    """
+    session_id = path.parent.name
+    metadata = load_metadata(session_id)
+    status_counts = Counter(row["overall_status"] for row in rows)
+    reviewed = len(rows) - status_counts.get("unreviewed", 0)
+
+    header = [
+        f"# OCR Annotation Summary: {metadata.get('session_name', session_id)}",
+        "",
+        f"- Session ID: `{session_id}`",
+        f"- Annotator: `{metadata.get('annotator', '')}`",
+        f"- Items: {len(rows)}",
+        f"- Reviewed: {reviewed}",
+        f"- Updated: {metadata.get('updated_at_utc', '')}",
+        "",
+        "## Status Counts",
+        "",
+        "| Status | Count |",
+        "| --- | ---: |",
+    ]
+    table = [
+        f"| {status} | {count} |" for status, count in sorted(status_counts.items())
+    ]
+    atomic_write_text(path, "\n".join(header + table) + "\n")
+
+
+def write_summary_files(session_id: str) -> dict[str, str]:
+    """Regenerate ``summary.csv`` and ``summary.md`` for a session and return their paths."""
+    rows = summary_rows(session_id)
+    base = session_dir(session_id)
+    targets = {
+        "summary_csv": base / "summary.csv",
+        "summary_md": base / "summary.md",
+    }
+    write_summary_csv(targets["summary_csv"], rows)
+    write_summary_md(targets["summary_md"], rows)
+    return {label: str(location) for label, location in targets.items()}
+
+
+def write_all_sessions_summary(path: Path | None = None) -> Path:
+    """Write the summary rows of all sessions into one CSV and return its path.
+
+    Defaults to ``all_sessions_summary.csv`` inside the sessions directory.
+    """
+    out_path = path or (SESSIONS_DIR / "all_sessions_summary.csv")
+    combined: list[dict[str, Any]] = [
+        row
+        for metadata in list_sessions()
+        for row in summary_rows(metadata["session_id"])
+    ]
+    write_summary_csv(out_path, combined)
+    return out_path
diff --git a/annotation_OCR/study_agreement.py b/annotation_OCR/study_agreement.py
new file mode 100644
index 0000000..0e73762
--- /dev/null
+++ b/annotation_OCR/study_agreement.py
@@ -0,0 +1,787 @@
+"""Compute agreement and accept/reject ratios for bundle-backed table studies."""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from collections import Counter
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from math import comb
+from pathlib import Path
+from typing import Any
+
+
+HERE = Path(__file__).resolve().parent
+DEFAULT_SESSIONS_DIR = HERE / "sessions"
+DEFAULT_ANALYSIS_ROOT = DEFAULT_SESSIONS_DIR / "study_analysis"
+REVIEWED_STATUSES = ("ok", "not_ok", "uncertain")
+VALID_STATUSES = set(REVIEWED_STATUSES).union({"unreviewed"})
+
+
+@dataclass(slots=True)
+class SessionPayload:
+    """Everything loaded from one session folder that the analysis needs."""
+
+    # Identity, copied from metadata.json (session_name falls back to the id).
+    session_id: str
+    session_name: str
+    annotator: str
+    # Study slot read from the session config ("study_slot").
+    slot: int
+    # Counters as stored in metadata.json.
+    item_count: int
+    completed_count: int
+    updated_at_utc: str
+    # Raw file contents: metadata.json, manifest "items", current_annotations.json.
+    metadata: dict[str, Any]
+    manifest_items: list[dict[str, Any]]
+    current_annotations: dict[str, dict[str, Any]]
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the study agreement analysis.
+
+    Options: ``--study-bundle`` (required), ``--sessions-dir``,
+    ``--output-dir``, ``--session-id`` (one or more, stored as
+    ``session_ids``) and the ``--strict-manifest`` flag.
+    """
+    cli = argparse.ArgumentParser(
+        description="Compute agreement metrics and accept/reject ratios for OCR table study sessions."
+    )
+
+    # Path-valued options.
+    cli.add_argument(
+        "--study-bundle",
+        type=Path,
+        required=True,
+        help="Path to the study_sessions_*.json bundle used for the annotation round.",
+    )
+    cli.add_argument(
+        "--sessions-dir",
+        type=Path,
+        default=DEFAULT_SESSIONS_DIR,
+        help="Directory containing annotation_OCR session folders.",
+    )
+    # NOTE(review): the "study_analysis//" in this help text looks like a
+    # placeholder was lost; confirm the intended wording before changing it.
+    cli.add_argument(
+        "--output-dir",
+        type=Path,
+        default=None,
+        help="Output directory for summary artifacts. Defaults to sessions/study_analysis//.",
+    )
+
+    # Session selection and validation behaviour.
+    cli.add_argument(
+        "--session-id",
+        dest="session_ids",
+        nargs="+",
+        default=None,
+        help="Optional explicit session ids to analyze. If omitted, all sessions linked to the study bundle are used.",
+    )
+    cli.add_argument(
+        "--strict-manifest",
+        action="store_true",
+        help="Fail if a selected session manifest does not match its bundle slot.",
+    )
+    return cli
+
+
+def utc_now() -> str:
+    """Current UTC timestamp in ISO-8601 format, truncated to whole seconds."""
+    moment = datetime.now(tz=timezone.utc)
+    return moment.isoformat(timespec="seconds")
+
+
+def load_json(path: Path, *, default: Any | None = None) -> Any:
+    """Parse the UTF-8 JSON file at ``path``; return ``default`` when it is not a file."""
+    if path.is_file():
+        raw = path.read_text(encoding="utf-8")
+        return json.loads(raw)
+    return default
+
+
+def atomic_write_text(path: Path, text: str) -> None:
+    """Write ``text`` (UTF-8) to a sibling ``.tmp`` file, then rename it over ``path``.
+
+    Missing parent directories are created first.
+    """
+    parent = path.parent
+    parent.mkdir(parents=True, exist_ok=True)
+    staging = path.with_suffix(f"{path.suffix}.tmp")
+    staging.write_text(text, encoding="utf-8")
+    staging.replace(path)
+
+
+def atomic_write_json(path: Path, payload: Any) -> None:
+    """Write ``payload`` as indented JSON followed by a newline, atomically."""
+    document = json.dumps(payload, ensure_ascii=False, indent=2)
+    atomic_write_text(path, f"{document}\n")
+
+
+def safe_div(numerator: int, denominator: int) -> float | None:
+    """Return ``numerator / denominator``, or ``None`` for a zero denominator."""
+    return numerator / denominator if denominator != 0 else None
+
+
+def format_ratio(value: float | None) -> str:
+    """Render a 0..1 ratio as a percentage with one decimal; ``None`` becomes ``"n/a"``."""
+    return "n/a" if value is None else f"{value * 100:.1f}%"
+
+
+def parse_int(value: Any) -> int | None:
+    """Return ``value`` as an ``int`` when it is one or a plain decimal string.
+
+    * ``bool`` is rejected even though it subclasses ``int``, so a JSON
+      ``true`` is never read as slot/count 1.
+    * Strings must consist of decimal digits only. ``str.isdecimal`` is used
+      because ``str.isdigit`` also accepts characters such as ``"²"`` that
+      ``int()`` cannot parse.
+
+    Anything else (floats, signed or padded strings, ``None``) yields ``None``.
+    """
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str) and value.isdecimal():
+        return int(value)
+    return None
+
+
+def normalize_status(value: Any) -> str:
+    """Return ``value`` when it is a known status string, otherwise ``"unreviewed"``."""
+    known = isinstance(value, str) and value in VALID_STATUSES
+    return value if known else "unreviewed"
+
+
+def load_study_bundle(path: Path) -> dict[str, Any]:
+    """Load a study bundle and check its basic shape.
+
+    Raises:
+        ValueError: the file is missing, is not a JSON object, has a
+            ``bundle_type`` other than ``"ocr_table_study_bundle"`` or has no
+            ``sessions`` list.
+    """
+    payload = load_json(path)
+    # The type check must come first: a missing file loads as None and a
+    # top-level array as list, and neither has .get().
+    if (
+        not isinstance(payload, dict)
+        or payload.get("bundle_type") != "ocr_table_study_bundle"
+        or not isinstance(payload.get("sessions"), list)
+    ):
+        raise ValueError(f"invalid study bundle in {path}")
+    return payload
+
+
+def build_bundle_index(
+    bundle: dict[str, Any],
+) -> tuple[dict[int, dict[str, Any]], dict[str, dict[str, Any]], list[str]]:
+    """Index a study bundle by slot and by item id.
+
+    Returns:
+        ``(slot_index, item_index, warnings)`` where
+
+        * ``slot_index`` maps slot number to the bundle session entry,
+        * ``item_index`` maps item id to a record holding the item's
+          descriptive fields, its ``study_assignment``, the largest declared
+          ``expected_votes`` and the sorted list of ``assigned_slots``,
+        * ``warnings`` lists non-fatal inconsistencies: a slot declared more
+          than once, or an item whose slot count differs from its declared
+          ``study_expected_votes``.
+
+    Raises:
+        ValueError: a session entry has no usable slot or item list, or an
+            item is not an object or lacks ``item_id``.
+    """
+    slot_index: dict[int, dict[str, Any]] = {}
+    item_index: dict[str, dict[str, Any]] = {}
+    warnings: list[str] = []
+
+    for session in bundle["sessions"]:
+        slot = parse_int(session.get("slot"))
+        items = session.get("items")
+        if slot is None or not isinstance(items, list):
+            raise ValueError("invalid study session entry in bundle")
+        if slot in slot_index:
+            # The later entry still replaces the earlier one, but no longer silently.
+            warnings.append(
+                f"bundle declares slot {slot} more than once; the later entry replaces the earlier one"
+            )
+        slot_index[slot] = session
+        for item in items:
+            if not isinstance(item, dict):
+                raise ValueError(f"bundle slot {slot} contains a non-object item")
+            item_id = str(item.get("item_id") or "")
+            if not item_id:
+                raise ValueError(f"bundle slot {slot} contains an item without item_id")
+            expected_votes = parse_int(item.get("study_expected_votes")) or 1
+            study_assignment = str(item.get("study_assignment") or "single")
+            # The first occurrence of an item supplies the descriptive fields.
+            record = item_index.setdefault(
+                item_id,
+                {
+                    "item_id": item_id,
+                    "industry_slug": item.get("industry_slug"),
+                    "report_name": item.get("report_name"),
+                    "exchange": item.get("exchange"),
+                    "ticker": item.get("ticker"),
+                    "year": item.get("year"),
+                    "page_index": item.get("page_index"),
+                    "page_number": item.get("page_number"),
+                    "table_index": item.get("table_index"),
+                    "table_row_count": item.get("table_row_count"),
+                    "table_col_count": item.get("table_col_count"),
+                    "study_assignment": study_assignment,
+                    "expected_votes": expected_votes,
+                    "assigned_slots": [],
+                },
+            )
+            record["assigned_slots"].append(slot)
+            record["expected_votes"] = max(record["expected_votes"], expected_votes)
+            if study_assignment == "agreement":
+                record["study_assignment"] = "agreement"
+
+    for item_id, record in item_index.items():
+        assigned_slots = sorted(record["assigned_slots"])
+        record["assigned_slots"] = assigned_slots
+        occurrence_count = len(assigned_slots)
+        # An item placed in several slots is an agreement item, whatever it declares.
+        if occurrence_count > 1:
+            record["study_assignment"] = "agreement"
+        if occurrence_count != record["expected_votes"]:
+            warnings.append(
+                f"bundle item {item_id} appears in {occurrence_count} slots but declares study_expected_votes={record['expected_votes']}"
+            )
+
+    return slot_index, item_index, warnings
+
+
+def load_session_payload(sessions_dir: Path, session_id: str) -> SessionPayload:
+    """Read one session folder into a ``SessionPayload``.
+
+    Raises:
+        FileNotFoundError: ``metadata.json`` is missing or not a JSON object.
+        ValueError: the manifest has no ``items`` list, the current
+            annotations are not a JSON object, or the session config carries
+            no usable ``study_slot``.
+    """
+    root = sessions_dir / session_id
+
+    metadata = load_json(root / "metadata.json")
+    if not isinstance(metadata, dict):
+        raise FileNotFoundError(f"missing metadata for session {session_id}")
+
+    manifest = load_json(root / "manifest.json", default={}) or {}
+    manifest_items = manifest.get("items") if isinstance(manifest, dict) else None
+    if not isinstance(manifest_items, list):
+        raise ValueError(f"invalid manifest for session {session_id}")
+
+    current_annotations = load_json(root / "current_annotations.json", default={}) or {}
+    if not isinstance(current_annotations, dict):
+        raise ValueError(f"invalid current_annotations for session {session_id}")
+
+    slot = parse_int((metadata.get("config") or {}).get("study_slot"))
+    if slot is None:
+        raise ValueError(f"session {session_id} has no usable study_slot")
+
+    # Stored counters win; missing or zero values fall back to derived defaults.
+    stored_item_count = parse_int(metadata.get("item_count"))
+    stored_completed = parse_int(metadata.get("completed_count"))
+    return SessionPayload(
+        session_id=session_id,
+        session_name=str(metadata.get("session_name") or session_id),
+        annotator=str(metadata.get("annotator") or ""),
+        slot=slot,
+        item_count=stored_item_count or len(manifest_items),
+        completed_count=stored_completed or 0,
+        updated_at_utc=str(metadata.get("updated_at_utc") or ""),
+        metadata=metadata,
+        manifest_items=manifest_items,
+        current_annotations=current_annotations,
+    )
+
+
+def discover_sessions(
+    *,
+    sessions_dir: Path,
+    bundle_path: Path,
+    session_ids: list[str] | None,
+) -> tuple[dict[int, SessionPayload], list[str]]:
+    """Find the annotation session that fills each study slot.
+
+    With ``session_ids=None`` every sub-directory of ``sessions_dir`` is
+    scanned and only sessions whose config references this bundle (by resolved
+    path) and has a parseable ``study_slot`` are kept. A missing
+    ``sessions_dir`` is reported as a warning and yields no sessions instead
+    of raising, so a fresh checkout (the directory is git-ignored) still
+    produces a report listing every slot as missing.
+
+    With an explicit ``session_ids`` list every listed session is loaded;
+    one that references a different bundle only triggers a warning.
+
+    When two sessions claim the same slot, the one with the greater
+    ``(updated_at_utc, session_id)`` pair is kept and a warning is recorded.
+
+    Returns:
+        ``(sessions_by_slot, warnings)``.
+
+    Raises:
+        FileNotFoundError: an explicitly requested session has no metadata.
+        ValueError: propagated from ``load_session_payload``.
+    """
+    bundle_resolved = str(bundle_path.resolve())
+    warnings: list[str] = []
+    discovered: dict[int, SessionPayload] = {}
+
+    auto_discover = session_ids is None
+    if auto_discover:
+        if not sessions_dir.is_dir():
+            # Nothing has been annotated yet (or the path is wrong): surface
+            # it in the report rather than crashing inside iterdir().
+            warnings.append(
+                f"sessions directory {sessions_dir} does not exist; no sessions discovered"
+            )
+            return discovered, warnings
+        candidate_ids = [
+            path.name for path in sorted(sessions_dir.iterdir()) if path.is_dir()
+        ]
+    else:
+        candidate_ids = session_ids
+
+    for session_id in candidate_ids:
+        metadata = load_json(sessions_dir / session_id / "metadata.json")
+        if not isinstance(metadata, dict):
+            if auto_discover:
+                # Stray directories without metadata are not sessions.
+                continue
+            raise FileNotFoundError(f"missing metadata for session {session_id}")
+
+        config = metadata.get("config") or {}
+        session_bundle = config.get("study_bundle_path")
+        session_slot = parse_int(config.get("study_slot"))
+        if auto_discover:
+            if session_bundle != bundle_resolved or session_slot is None:
+                continue
+
+        payload = load_session_payload(sessions_dir, session_id)
+        if not auto_discover and session_bundle not in {None, bundle_resolved}:
+            warnings.append(
+                f"session {session_id} references a different study bundle: {session_bundle}"
+            )
+
+        existing = discovered.get(payload.slot)
+        if existing is None:
+            discovered[payload.slot] = payload
+            continue
+
+        # Slot collision: prefer the most recently updated session, breaking
+        # ties on the session id so the choice is deterministic.
+        keep = payload
+        drop = existing
+        if (existing.updated_at_utc, existing.session_id) > (
+            payload.updated_at_utc,
+            payload.session_id,
+        ):
+            keep = existing
+            drop = payload
+        discovered[payload.slot] = keep
+        warnings.append(
+            f"multiple sessions claim study slot {payload.slot}; keeping {keep.session_id} and ignoring {drop.session_id}"
+        )
+
+    return discovered, warnings
+
+
+def validate_session_manifest(
+    *,
+    payload: SessionPayload,
+    expected_session: dict[str, Any],
+    strict: bool,
+) -> list[str]:
+    """Check that a session's manifest holds the same item IDs as its bundle slot.
+
+    IDs are compared as multisets (order is ignored, duplicates count).
+
+    Returns:
+        An empty list on a match, otherwise a one-element list with the
+        mismatch message.
+
+    Raises:
+        ValueError: on a mismatch when ``strict`` is true.
+    """
+
+    def id_multiset(entries: list[dict[str, Any]]) -> Counter[str]:
+        return Counter(str(entry.get("item_id") or "") for entry in entries)
+
+    session_ids = id_multiset(payload.manifest_items)
+    bundle_ids = id_multiset(expected_session["items"])
+    if session_ids == bundle_ids:
+        return []
+
+    message = f"session {payload.session_id} manifest does not match bundle slot {payload.slot}"
+    if strict:
+        raise ValueError(message)
+    return [message]
+
+
+def status_ratio_block(counts: Counter[str]) -> dict[str, Any]:
+    """Turn per-status tallies into counts plus accept/reject/uncertain ratios.
+
+    ``reviewed`` sums every status in ``REVIEWED_STATUSES``; ``decided`` is
+    ``ok + not_ok``. ``*_rate_all`` ratios divide by ``reviewed`` and
+    ``*_ratio_decided`` ratios divide by ``decided``, all via ``safe_div``.
+    """
+    ok_total = counts.get("ok", 0)
+    not_ok_total = counts.get("not_ok", 0)
+    uncertain_total = counts.get("uncertain", 0)
+
+    reviewed = sum(counts.get(status, 0) for status in REVIEWED_STATUSES)
+    decided = ok_total + not_ok_total
+
+    block: dict[str, Any] = {
+        "reviewed": reviewed,
+        "decided": decided,
+        "ok": ok_total,
+        "not_ok": not_ok_total,
+        "uncertain": uncertain_total,
+    }
+    block["ok_rate_all"] = safe_div(ok_total, reviewed)
+    block["not_ok_rate_all"] = safe_div(not_ok_total, reviewed)
+    block["uncertain_rate_all"] = safe_div(uncertain_total, reviewed)
+    block["accept_ratio_decided"] = safe_div(ok_total, decided)
+    block["reject_ratio_decided"] = safe_div(not_ok_total, decided)
+    return block
+
+
+def majority_status(counts: Counter[str], vote_count: int) -> str | None:
+    """Return the status holding a strict majority of ``vote_count`` votes.
+
+    Returns ``None`` when there are no votes, when the top tally is not more
+    than half of ``vote_count``, or when several statuses share the top tally.
+    """
+    if vote_count == 0:
+        return None
+
+    best = max(counts.values(), default=0)
+    leaders = [status for status, tally in counts.items() if tally == best]
+    holds_strict_majority = best * 2 > vote_count
+    if holds_strict_majority and len(leaders) == 1:
+        return leaders[0]
+    return None
+
+
+def compute_pairwise_agreement(item_rows: list[dict[str, Any]]) -> dict[str, Any]:
+    """Compute the share of vote pairs that agree, over rows with 2+ votes.
+
+    For each qualifying row, ``C(vote_count, 2)`` pairs are possible and the
+    matching pairs are the sum of ``C(<status>_votes, 2)`` over
+    ``REVIEWED_STATUSES``.
+    """
+    considered = 0
+    pairs_total = 0
+    pairs_matching = 0
+
+    for row in item_rows:
+        votes = int(row["vote_count"])
+        if votes >= 2:
+            considered += 1
+            pairs_total += comb(votes, 2)
+            for status in REVIEWED_STATUSES:
+                pairs_matching += comb(int(row[f"{status}_votes"]), 2)
+
+    return {
+        "items_considered": considered,
+        "pairs_total": pairs_total,
+        "pairs_matching": pairs_matching,
+        "agreement_rate": safe_div(pairs_matching, pairs_total),
+    }
+
+
+def compute_fleiss_kappa(item_rows: list[dict[str, Any]]) -> float | None:
+    """Compute Fleiss' kappa over rows that all carry the same number of votes.
+
+    Returns ``None`` when there are no rows, fewer than two raters are
+    expected, rows disagree on ``expected_votes``, any row's per-status votes
+    do not sum to that number, or the chance agreement equals 1.0 (kappa
+    undefined).
+    """
+    if not item_rows:
+        return None
+
+    raters = int(item_rows[0]["expected_votes"])
+    if raters < 2:
+        return None
+    if any(int(row["expected_votes"]) != raters for row in item_rows):
+        return None
+
+    subject_count = len(item_rows)
+    per_item_agreement: list[float] = []
+    totals_by_status = Counter[str]()
+
+    for row in item_rows:
+        tallies = [(status, int(row[f"{status}_votes"])) for status in REVIEWED_STATUSES]
+        if sum(tally for _, tally in tallies) != raters:
+            return None
+        for status, tally in tallies:
+            totals_by_status[status] += tally
+        squares = sum(tally * tally for _, tally in tallies)
+        per_item_agreement.append((squares - raters) / (raters * (raters - 1)))
+
+    observed = sum(per_item_agreement) / subject_count
+
+    # Accumulate with a plain running float so the result matches a manual
+    # left-to-right addition exactly.
+    expected = 0.0
+    for status in REVIEWED_STATUSES:
+        share = totals_by_status[status] / (subject_count * raters)
+        expected += share * share
+
+    if expected == 1.0:
+        return None
+    return (observed - expected) / (1.0 - expected)
+
+
+def build_analysis(
+ *,
+ bundle_path: Path,
+ sessions_dir: Path,
+ session_ids: list[str] | None,
+ strict_manifest: bool,
+) -> tuple[dict[str, Any], list[dict[str, Any]], list[dict[str, Any]]]:
+ """Join a study bundle with its annotation sessions and compute agreement metrics.
+
+ Loads the bundle, discovers one session per slot, then builds:
+
+ * ``session_rows`` - one dict per slot that has a session, with status
+ tallies and ratios for the items the bundle assigns to that slot;
+ * ``item_rows`` - one dict per bundle item, with the votes collected from
+ its assigned slots, completeness/unanimity/majority flags and the final
+ status;
+ * ``summary`` - aggregate vote ratios, agreement statistics, final table
+ decisions and every warning gathered along the way.
+
+ Returns ``(summary, session_rows, item_rows)``. With ``strict_manifest``
+ set, a manifest/bundle mismatch raises ``ValueError`` (from
+ ``validate_session_manifest``) instead of being recorded as a warning.
+ """
+ bundle = load_study_bundle(bundle_path)
+ slot_index, item_index, warnings = build_bundle_index(bundle)
+ selected_sessions, session_warnings = discover_sessions(
+ sessions_dir=sessions_dir,
+ bundle_path=bundle_path,
+ session_ids=session_ids,
+ )
+ warnings.extend(session_warnings)
+
+ expected_slots = sorted(slot_index)
+ missing_slots = [slot for slot in expected_slots if slot not in selected_sessions]
+
+ # Per-session pass: tally the status of every item the bundle assigns to
+ # the slot, using the session's current annotations.
+ session_rows: list[dict[str, Any]] = []
+ for slot in expected_slots:
+ payload = selected_sessions.get(slot)
+ if payload is None:
+ continue
+ warnings.extend(
+ validate_session_manifest(
+ payload=payload,
+ expected_session=slot_index[slot],
+ strict=strict_manifest,
+ )
+ )
+ slot_status_counts = Counter[str]()
+ for item in slot_index[slot]["items"]:
+ item_id = str(item["item_id"])
+ annotation = payload.current_annotations.get(item_id) or {}
+ slot_status_counts[normalize_status(annotation.get("overall_status"))] += 1
+ reviewed_count = (
+ len(slot_index[slot]["items"]) - slot_status_counts["unreviewed"]
+ )
+ # Cross-check the counter stored in session metadata against what the
+ # annotations themselves imply.
+ if reviewed_count != payload.completed_count:
+ warnings.append(
+ f"session {payload.session_id} metadata says completed_count={payload.completed_count} but current_annotations implies {reviewed_count}"
+ )
+ status_block = status_ratio_block(slot_status_counts)
+ session_rows.append(
+ {
+ "slot": slot,
+ "session_id": payload.session_id,
+ "session_name": payload.session_name,
+ "annotator": payload.annotator,
+ "item_count": len(slot_index[slot]["items"]),
+ "reviewed_count": reviewed_count,
+ "unreviewed_count": slot_status_counts["unreviewed"],
+ "ok": slot_status_counts["ok"],
+ "not_ok": slot_status_counts["not_ok"],
+ "uncertain": slot_status_counts["uncertain"],
+ "accept_ratio_decided": status_block["accept_ratio_decided"],
+ "reject_ratio_decided": status_block["reject_ratio_decided"],
+ "uncertain_rate_all": status_block["uncertain_rate_all"],
+ "updated_at_utc": payload.updated_at_utc,
+ }
+ )
+
+ # Vote-level tallies, split by whether the item belongs to the agreement
+ # subset or is a single-annotator item.
+ vote_level_counts_all = Counter[str]()
+ vote_level_counts_single = Counter[str]()
+ vote_level_counts_agreement = Counter[str]()
+ item_rows: list[dict[str, Any]] = []
+
+ # Per-item pass: collect one vote from each assigned slot that has a
+ # session and a reviewed status.
+ for item_id, record in sorted(item_index.items()):
+ votes: list[dict[str, Any]] = []
+ missing_session_slots_for_item: list[int] = []
+ unreviewed_slots: list[int] = []
+ available_slots: list[int] = []
+
+ for slot in record["assigned_slots"]:
+ payload = selected_sessions.get(slot)
+ if payload is None:
+ missing_session_slots_for_item.append(slot)
+ continue
+ available_slots.append(slot)
+ annotation = payload.current_annotations.get(item_id) or {}
+ status = normalize_status(annotation.get("overall_status"))
+ if status == "unreviewed":
+ unreviewed_slots.append(slot)
+ continue
+ vote = {
+ "slot": slot,
+ "session_id": payload.session_id,
+ "annotator": payload.annotator,
+ "overall_status": status,
+ "updated_at_utc": annotation.get("updated_at_utc", ""),
+ }
+ votes.append(vote)
+ vote_level_counts_all[status] += 1
+ if record["study_assignment"] == "agreement":
+ vote_level_counts_agreement[status] += 1
+ else:
+ vote_level_counts_single[status] += 1
+
+ vote_counts = Counter(vote["overall_status"] for vote in votes)
+ vote_count = len(votes)
+ majority = majority_status(vote_counts, vote_count)
+ is_complete = vote_count == int(record["expected_votes"])
+ is_unanimous = is_complete and len(vote_counts) == 1
+ # Final status: a single item takes its only vote; an agreement item
+ # needs every expected vote plus a majority status.
+ final_status = None
+ if record["study_assignment"] == "single" and vote_count == 1:
+ final_status = votes[0]["overall_status"]
+ elif record["study_assignment"] == "agreement" and is_complete and majority:
+ final_status = majority
+
+ # List-valued fields are stored as JSON strings, and missing statuses as
+ # empty strings, so each row is flat.
+ item_rows.append(
+ {
+ "item_id": item_id,
+ "study_assignment": record["study_assignment"],
+ "expected_votes": int(record["expected_votes"]),
+ "assigned_slots": json.dumps(record["assigned_slots"]),
+ "available_slots": json.dumps(available_slots),
+ "missing_session_slots": json.dumps(missing_session_slots_for_item),
+ "unreviewed_slots": json.dumps(unreviewed_slots),
+ "vote_count": vote_count,
+ "ok_votes": vote_counts["ok"],
+ "not_ok_votes": vote_counts["not_ok"],
+ "uncertain_votes": vote_counts["uncertain"],
+ "is_complete": is_complete,
+ "is_unanimous": is_unanimous,
+ "has_majority": majority is not None,
+ "majority_status": majority or "",
+ "final_status": final_status or "",
+ "votes_json": json.dumps(votes, ensure_ascii=False),
+ "industry_slug": record.get("industry_slug"),
+ "report_name": record.get("report_name"),
+ "exchange": record.get("exchange"),
+ "ticker": record.get("ticker"),
+ "year": record.get("year"),
+ "page_index": record.get("page_index"),
+ "page_number": record.get("page_number"),
+ "table_index": record.get("table_index"),
+ "table_row_count": record.get("table_row_count"),
+ "table_col_count": record.get("table_col_count"),
+ }
+ )
+
+ # Slices of the agreement subset used by the summary below. "Complete"
+ # rows have exactly the expected number of votes.
+ agreement_rows = [
+ row for row in item_rows if row["study_assignment"] == "agreement"
+ ]
+ complete_agreement_rows = [row for row in agreement_rows if row["is_complete"]]
+ agreement_rows_with_2plus_votes = [
+ row for row in agreement_rows if int(row["vote_count"]) >= 2
+ ]
+ unanimous_rows = [row for row in complete_agreement_rows if row["is_unanimous"]]
+ # Majority reached but not unanimously.
+ majority_rows = [
+ row
+ for row in complete_agreement_rows
+ if row["has_majority"] and not row["is_unanimous"]
+ ]
+ no_majority_rows = [
+ row for row in complete_agreement_rows if not row["has_majority"]
+ ]
+
+ # Rows whose final/majority status is the empty string are excluded.
+ final_status_counts = Counter(
+ row["final_status"] for row in item_rows if row["final_status"]
+ )
+ agreement_final_counts = Counter(
+ row["majority_status"]
+ for row in complete_agreement_rows
+ if row["majority_status"]
+ )
+
+ summary = {
+ "analysis_completed_at_utc": utc_now(),
+ "study_bundle_path": str(bundle_path.resolve()),
+ "sessions_dir": str(sessions_dir.resolve()),
+ "bundle": {
+ "annotator_count": bundle.get("annotator_count"),
+ "required_votes": bundle.get("required_votes"),
+ "summary": bundle.get("summary") or {},
+ },
+ "session_coverage": {
+ "expected_slots": expected_slots,
+ "sessions_found": len(selected_sessions),
+ "missing_slots": missing_slots,
+ },
+ "annotation_votes": {
+ "all": status_ratio_block(vote_level_counts_all),
+ "single": status_ratio_block(vote_level_counts_single),
+ "agreement": status_ratio_block(vote_level_counts_agreement),
+ },
+ "agreement": {
+ "tables_total": len(agreement_rows),
+ "tables_with_any_vote": sum(
+ 1 for row in agreement_rows if row["vote_count"] > 0
+ ),
+ "tables_with_2plus_votes": len(agreement_rows_with_2plus_votes),
+ "tables_complete": len(complete_agreement_rows),
+ "unanimous_tables": len(unanimous_rows),
+ "mixed_majority_tables": len(majority_rows),
+ "no_majority_tables": len(no_majority_rows),
+ "exact_agreement_rate": safe_div(
+ len(unanimous_rows), len(complete_agreement_rows)
+ ),
+ "complete_pairwise": compute_pairwise_agreement(complete_agreement_rows),
+ "partial_pairwise": compute_pairwise_agreement(
+ agreement_rows_with_2plus_votes
+ ),
+ "fleiss_kappa": compute_fleiss_kappa(complete_agreement_rows),
+ "majority_status_counts": dict(agreement_final_counts),
+ "majority_status_ratios": status_ratio_block(agreement_final_counts),
+ },
+ "final_table_decisions": {
+ "tables_with_final_status": sum(final_status_counts.values()),
+ "status_counts": dict(final_status_counts),
+ "status_ratios": status_ratio_block(final_status_counts),
+ },
+ "warnings": warnings,
+ }
+ return summary, session_rows, item_rows
+
+
+def write_csv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None:
+    """Write ``rows`` to ``path`` as UTF-8 CSV with a header row.
+
+    Parent directories are created as needed. Keys that are not listed in
+    ``fieldnames`` are dropped. The data is written to ``<path>.tmp`` first
+    and then moved over ``path``.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    staging_path = path.with_suffix(path.suffix + ".tmp")
+    with staging_path.open("w", newline="", encoding="utf-8") as stream:
+        sink = csv.DictWriter(stream, fieldnames=fieldnames, extrasaction="ignore")
+        sink.writeheader()
+        for record in rows:
+            sink.writerow(record)
+    staging_path.replace(path)
+
+
+def render_summary_markdown(
+    *, summary: dict[str, Any], session_rows: list[dict[str, Any]]
+) -> str:
+    """Render the analysis summary as a Markdown report.
+
+    Args:
+        summary: the summary dict produced by ``build_analysis``; the keys
+            ``session_coverage``, ``annotation_votes``, ``agreement``,
+            ``final_table_decisions`` and (optionally) ``warnings`` are read.
+        session_rows: per-session rows; rendered as a table sorted by slot.
+
+    Returns:
+        The report text, terminated by a newline.
+
+    Session IDs and annotator names are free text, so they are escaped before
+    being placed in the session table: a raw ``|`` would start a new column
+    and a newline would end the row.
+    """
+
+    def table_cell(value: Any) -> str:
+        """Make ``value`` safe to embed in one Markdown table cell."""
+        text = str(value)
+        for newline in ("\r\n", "\r", "\n"):
+            text = text.replace(newline, " ")
+        return text.replace("|", "\\|")
+
+    session_coverage = summary["session_coverage"]
+    all_votes = summary["annotation_votes"]["all"]
+    agreement = summary["agreement"]
+    final_tables = summary["final_table_decisions"]
+    warnings = summary.get("warnings") or []
+
+    # Kappa is None when it is undefined for the available data.
+    kappa = agreement["fleiss_kappa"]
+    if kappa is not None:
+        kappa_line = f"- Fleiss' kappa: {kappa:.4f}"
+    else:
+        kappa_line = "- Fleiss' kappa: n/a"
+
+    lines = [
+        "# OCR Table Study Agreement Summary",
+        "",
+        f"- Generated: {summary['analysis_completed_at_utc']}",
+        f"- Study bundle: {summary['study_bundle_path']}",
+        f"- Sessions directory: {summary['sessions_dir']}",
+        f"- Sessions found: {session_coverage['sessions_found']} / {len(session_coverage['expected_slots'])}",
+        f"- Missing slots: {', '.join(str(slot) for slot in session_coverage['missing_slots']) or 'none'}",
+        "",
+        "## Vote-Level Ratios",
+        "",
+        f"- Reviewed votes: {all_votes['reviewed']}",
+        f"- Accept rate among all reviewed votes: {format_ratio(all_votes['ok_rate_all'])}",
+        f"- Reject rate among all reviewed votes: {format_ratio(all_votes['not_ok_rate_all'])}",
+        f"- Uncertain rate among all reviewed votes: {format_ratio(all_votes['uncertain_rate_all'])}",
+        f"- Accept ratio among decided votes: {format_ratio(all_votes['accept_ratio_decided'])}",
+        f"- Reject ratio among decided votes: {format_ratio(all_votes['reject_ratio_decided'])}",
+        "",
+        "## Agreement Subset",
+        "",
+        f"- Agreement tables total: {agreement['tables_total']}",
+        f"- Agreement tables with 2+ votes: {agreement['tables_with_2plus_votes']}",
+        f"- Agreement tables complete: {agreement['tables_complete']}",
+        f"- Exact agreement rate: {format_ratio(agreement['exact_agreement_rate'])}",
+        f"- Complete pairwise agreement: {format_ratio(agreement['complete_pairwise']['agreement_rate'])}",
+        f"- Partial pairwise agreement: {format_ratio(agreement['partial_pairwise']['agreement_rate'])}",
+        kappa_line,
+        f"- Unanimous tables: {agreement['unanimous_tables']}",
+        f"- Mixed-majority tables: {agreement['mixed_majority_tables']}",
+        f"- No-majority tables: {agreement['no_majority_tables']}",
+        "",
+        "## Final Table Decisions",
+        "",
+        f"- Tables with a final status: {final_tables['tables_with_final_status']}",
+        f"- Accept rate at table level: {format_ratio(final_tables['status_ratios']['ok_rate_all'])}",
+        f"- Reject rate at table level: {format_ratio(final_tables['status_ratios']['not_ok_rate_all'])}",
+        f"- Uncertain rate at table level: {format_ratio(final_tables['status_ratios']['uncertain_rate_all'])}",
+        f"- Accept ratio among decided tables: {format_ratio(final_tables['status_ratios']['accept_ratio_decided'])}",
+        f"- Reject ratio among decided tables: {format_ratio(final_tables['status_ratios']['reject_ratio_decided'])}",
+        "",
+        "## Session Breakdown",
+        "",
+        "| Slot | Session ID | Annotator | Reviewed | OK | Not OK | Uncertain | Accept Ratio | Reject Ratio |",
+        "| ---: | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |",
+    ]
+
+    for row in sorted(session_rows, key=lambda item: item["slot"]):
+        cells = [
+            str(row["slot"]),
+            table_cell(row["session_id"]),
+            table_cell(row["annotator"]),
+            str(row["reviewed_count"]),
+            str(row["ok"]),
+            str(row["not_ok"]),
+            str(row["uncertain"]),
+            format_ratio(row["accept_ratio_decided"]),
+            format_ratio(row["reject_ratio_decided"]),
+        ]
+        lines.append("| " + " | ".join(cells) + " |")
+
+    if warnings:
+        lines.extend(["", "## Warnings", ""])
+        lines.extend(f"- {warning}" for warning in warnings)
+
+    return "\n".join(lines) + "\n"
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the study analysis from the command line and write its outputs.
+
+    Writes ``summary.json``, ``summary.md``, ``session_metrics.csv`` and
+    ``item_metrics.csv`` into the output directory (``--output-dir`` or
+    ``DEFAULT_ANALYSIS_ROOT / <bundle stem>``), prints a short JSON digest
+    and returns 0.
+    """
+    args = build_arg_parser().parse_args(argv)
+    output_dir = args.output_dir or (DEFAULT_ANALYSIS_ROOT / args.study_bundle.stem)
+
+    summary, session_rows, item_rows = build_analysis(
+        bundle_path=args.study_bundle,
+        sessions_dir=args.sessions_dir,
+        session_ids=args.session_ids,
+        strict_manifest=args.strict_manifest,
+    )
+
+    # Column order of the two CSV exports.
+    session_columns = [
+        "slot",
+        "session_id",
+        "session_name",
+        "annotator",
+        "item_count",
+        "reviewed_count",
+        "unreviewed_count",
+        "ok",
+        "not_ok",
+        "uncertain",
+        "accept_ratio_decided",
+        "reject_ratio_decided",
+        "uncertain_rate_all",
+        "updated_at_utc",
+    ]
+    item_columns = [
+        "item_id",
+        "study_assignment",
+        "expected_votes",
+        "assigned_slots",
+        "available_slots",
+        "missing_session_slots",
+        "unreviewed_slots",
+        "vote_count",
+        "ok_votes",
+        "not_ok_votes",
+        "uncertain_votes",
+        "is_complete",
+        "is_unanimous",
+        "has_majority",
+        "majority_status",
+        "final_status",
+        "votes_json",
+        "industry_slug",
+        "report_name",
+        "exchange",
+        "ticker",
+        "year",
+        "page_index",
+        "page_number",
+        "table_index",
+        "table_row_count",
+        "table_col_count",
+    ]
+
+    atomic_write_json(output_dir / "summary.json", summary)
+    report_text = render_summary_markdown(summary=summary, session_rows=session_rows)
+    atomic_write_text(output_dir / "summary.md", report_text)
+    write_csv(
+        output_dir / "session_metrics.csv", session_rows, fieldnames=session_columns
+    )
+    write_csv(output_dir / "item_metrics.csv", item_rows, fieldnames=item_columns)
+
+    print(f"Wrote study analysis to {output_dir}")
+    table_ratios = summary["final_table_decisions"]["status_ratios"]
+    digest = {
+        "sessions_found": summary["session_coverage"]["sessions_found"],
+        "agreement_tables_complete": summary["agreement"]["tables_complete"],
+        "exact_agreement_rate": summary["agreement"]["exact_agreement_rate"],
+        "final_table_accept_ratio": table_ratios["accept_ratio_decided"],
+        "final_table_reject_ratio": table_ratios["reject_ratio_decided"],
+        "warnings": len(summary.get("warnings") or []),
+    }
+    print(json.dumps(digest, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/annotation_OCR/study_sessions.py b/annotation_OCR/study_sessions.py
new file mode 100644
index 0000000..731c5af
--- /dev/null
+++ b/annotation_OCR/study_sessions.py
@@ -0,0 +1,277 @@
+"""Build balanced table-study session bundles from a base table manifest."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+from pathlib import Path
+from typing import Any
+
+
+# Directory containing this script; default paths are resolved against it.
+HERE = Path(__file__).resolve().parent
+DEFAULT_SOURCE_MANIFEST = HERE / "manifests" / "tables_5000.json"
+DEFAULT_OUTPUT_DIR = HERE / "manifests"
+
+# Defaults for the CLI options of the same names.
+DEFAULT_TOTAL_TABLES = 1200
+DEFAULT_MIN_SESSION_ITEMS = 100
+DEFAULT_MAX_SESSION_ITEMS = 140
+# Number of sessions each agreement (overlap) table is assigned to.
+DEFAULT_REQUIRED_VOTES = 3
+# Annotator count -> number of overlap tables for that study size. main()
+# rejects annotator counts that are not keys of this mapping.
+DEFAULT_OVERLAP_BY_ANNOTATORS = {
+ 13: 250,
+ 14: 300,
+ 15: 300,
+ 16: 300,
+ 17: 300,
+}
+
+
+def load_manifest_items(path: Path) -> list[dict[str, Any]]:
+    """Load the ``items`` list from a JSON manifest file.
+
+    Args:
+        path: UTF-8 JSON file whose top level is an object with an ``items``
+            list.
+
+    Returns:
+        The ``items`` list as stored in the file.
+
+    Raises:
+        ValueError: the top-level JSON value is not an object, or its
+            ``items`` entry is missing or not a list.
+        OSError / json.JSONDecodeError: the file cannot be read or parsed.
+    """
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    # A manifest whose root is a list, string, number or null has no ``.get``;
+    # report it as an invalid manifest instead of failing with AttributeError.
+    items = payload.get("items") if isinstance(payload, dict) else None
+    if not isinstance(items, list):
+        raise ValueError(f"invalid manifest items in {path}")
+    return items
+
+
+def balanced_counts(total: int, buckets: int) -> list[int]:
+    """Split ``total`` into ``buckets`` integer parts that differ by at most one.
+
+    The larger parts come first, e.g. ``balanced_counts(10, 3) == [4, 3, 3]``.
+    """
+    share, leftover = divmod(total, buckets)
+    return [share + 1] * leftover + [share] * (buckets - leftover)
+
+
+def pick_study_tables(
+    items: list[dict[str, Any]], *, total_tables: int, seed: int
+) -> list[dict[str, Any]]:
+    """Draw ``total_tables`` distinct items in a seeded, reproducible order.
+
+    Raises:
+        ValueError: more tables are requested than the manifest contains.
+    """
+    available = len(items)
+    if available < total_tables:
+        raise ValueError(
+            f"requested {total_tables} tables from manifest with only {available} items"
+        )
+
+    generator = random.Random(seed)
+    chosen = generator.sample(items, total_tables)
+    # The extra shuffle is part of the seeded sequence; keep it so existing
+    # seeds continue to reproduce the same selection order.
+    generator.shuffle(chosen)
+    return chosen
+
+
+def choose_overlap_sessions(
+    *, overlap_items: list[dict[str, Any]], overlap_counts: list[int], seed: int
+) -> list[list[dict[str, Any]]]:
+    """Assign every overlap item to ``DEFAULT_REQUIRED_VOTES`` distinct sessions.
+
+    ``overlap_counts[i]`` is the number of overlap items session ``i`` must
+    receive. Each item goes to the sessions with the most remaining capacity;
+    ties are broken by a seeded shuffle.
+
+    Raises:
+        ValueError: fewer than ``DEFAULT_REQUIRED_VOTES`` sessions still have
+            capacity for an item, or capacity is left over at the end.
+    """
+    generator = random.Random(seed)
+    capacity = list(overlap_counts)
+    per_session: list[list[dict[str, Any]]] = [[] for _ in overlap_counts]
+
+    for item in overlap_items:
+        open_sessions = [slot for slot, left in enumerate(capacity) if left > 0]
+        if len(open_sessions) < DEFAULT_REQUIRED_VOTES:
+            raise ValueError(
+                "not enough session capacity left for agreement assignment"
+            )
+        # Shuffle first, then stable-sort by descending capacity, so sessions
+        # with equal capacity are ordered randomly.
+        generator.shuffle(open_sessions)
+        ranked = sorted(open_sessions, key=lambda slot: -capacity[slot])
+        for slot in ranked[:DEFAULT_REQUIRED_VOTES]:
+            per_session[slot].append(item)
+            capacity[slot] -= 1
+
+    if any(capacity):
+        raise ValueError("failed to exhaust overlap assignment capacities")
+
+    return per_session
+
+
+def build_session_items(
+    *,
+    selected_items: list[dict[str, Any]],
+    annotator_count: int,
+    overlap_tables: int,
+    seed: int,
+    min_session_items: int,
+    max_session_items: int,
+) -> dict[str, Any]:
+    """Distribute the selected tables over ``annotator_count`` sessions.
+
+    The first ``overlap_tables`` items become agreement items, each placed in
+    ``DEFAULT_REQUIRED_VOTES`` sessions; the rest are single items placed in
+    exactly one session. Session sizes are balanced and must fall inside
+    ``[min_session_items, max_session_items]``.
+
+    Returns:
+        A dict with the per-session manifests (``sessions``) and the counts
+        describing the layout.
+
+    Raises:
+        ValueError: the overlap exceeds the selection, the sizes do not fit
+            the allowed range, or the unique-item bookkeeping is inconsistent.
+    """
+    total_tables = len(selected_items)
+    if overlap_tables > total_tables:
+        raise ValueError("overlap table count cannot exceed selected tables")
+
+    # Every overlap table is annotated DEFAULT_REQUIRED_VOTES times, i.e.
+    # (votes - 1) extra annotations on top of the one each table gets anyway.
+    total_annotations = total_tables + (DEFAULT_REQUIRED_VOTES - 1) * overlap_tables
+    session_sizes = balanced_counts(total_annotations, annotator_count)
+    if not all(min_session_items <= size <= max_session_items for size in session_sizes):
+        raise ValueError(
+            f"cannot distribute {total_annotations} annotations across {annotator_count} sessions "
+            f"inside [{min_session_items}, {max_session_items}]"
+        )
+
+    overlap_items = selected_items[:overlap_tables]
+    unique_items = selected_items[overlap_tables:]
+    overlap_assignments = choose_overlap_sessions(
+        overlap_items=overlap_items,
+        overlap_counts=balanced_counts(
+            overlap_tables * DEFAULT_REQUIRED_VOTES, annotator_count
+        ),
+        seed=seed + annotator_count,
+    )
+
+    unique_counts = [
+        size - len(assigned)
+        for size, assigned in zip(session_sizes[:annotator_count], overlap_assignments[:annotator_count], strict=True)
+    ] if annotator_count >= 0 and len(session_sizes) >= annotator_count and len(overlap_assignments) >= annotator_count else [
+        session_sizes[index] - len(overlap_assignments[index])
+        for index in range(annotator_count)
+    ]
+    if sum(unique_counts) != len(unique_items):
+        raise ValueError("unique assignment counts do not match remaining tables")
+
+    generator = random.Random(seed + 1000 + annotator_count)
+    unique_pool = list(unique_items)
+    generator.shuffle(unique_pool)
+
+    def tagged(item: dict[str, Any], assignment: str, votes: int, slot: int) -> dict[str, Any]:
+        return {
+            **dict(item),
+            "study_assignment": assignment,
+            "study_expected_votes": votes,
+            "study_session_slot": slot,
+        }
+
+    sessions: list[dict[str, Any]] = []
+    cursor = 0
+    for session_index in range(annotator_count):
+        slot = session_index + 1
+        agreement_records = [
+            tagged(item, "agreement", DEFAULT_REQUIRED_VOTES, slot)
+            for item in overlap_assignments[session_index]
+        ]
+        # Take the next run of unique items from the shuffled pool.
+        unique_records = [
+            tagged(unique_pool[cursor + offset], "single", 1, slot)
+            for offset in range(unique_counts[session_index])
+        ]
+        cursor += len(unique_records)
+
+        manifest_items = agreement_records + unique_records
+        generator.shuffle(manifest_items)
+        sessions.append(
+            {
+                "slot": slot,
+                "target_items": len(manifest_items),
+                "agreement_items": len(agreement_records),
+                "single_items": len(unique_records),
+                "items": manifest_items,
+            }
+        )
+
+    return {
+        "annotator_count": annotator_count,
+        "session_item_counts": [session["target_items"] for session in sessions],
+        "overlap_tables": overlap_tables,
+        "unique_tables": total_tables,
+        "total_annotations": total_annotations,
+        "sessions": sessions,
+    }
+
+
+def build_study_bundle(
+    *,
+    source_manifest_path: Path,
+    annotator_count: int,
+    overlap_tables: int,
+    total_tables: int,
+    seed: int,
+    min_session_items: int,
+    max_session_items: int,
+) -> dict[str, Any]:
+    """Build the full study bundle for one annotator count.
+
+    Loads the source manifest, draws ``total_tables`` items with ``seed`` and
+    distributes them over the sessions. The returned dict records the inputs,
+    a ``summary`` of the layout and the per-session manifests.
+    """
+    source_items = load_manifest_items(source_manifest_path)
+    picked = pick_study_tables(source_items, total_tables=total_tables, seed=seed)
+    layout = build_session_items(
+        selected_items=picked,
+        annotator_count=annotator_count,
+        overlap_tables=overlap_tables,
+        seed=seed,
+        min_session_items=min_session_items,
+        max_session_items=max_session_items,
+    )
+
+    summary = {
+        "annotator_count": annotator_count,
+        "unique_tables": layout["unique_tables"],
+        "agreement_tables": layout["overlap_tables"],
+        "total_annotations": layout["total_annotations"],
+        "session_item_counts": layout["session_item_counts"],
+    }
+    return {
+        "bundle_type": "ocr_table_study_bundle",
+        "source_manifest_path": str(source_manifest_path),
+        "seed": seed,
+        "annotator_count": annotator_count,
+        "required_votes": DEFAULT_REQUIRED_VOTES,
+        "min_session_items": min_session_items,
+        "max_session_items": max_session_items,
+        "summary": summary,
+        "sessions": layout["sessions"],
+    }
+
+
+def write_bundle(path: Path, payload: dict[str, Any]) -> None:
+    """Serialise ``payload`` as indented JSON and move it into place at ``path``.
+
+    Parent directories are created as needed; the text is first written to
+    ``<path>.tmp`` and then renamed over ``path``.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    staging_path = path.with_suffix(path.suffix + ".tmp")
+    document = json.dumps(payload, indent=2)
+    staging_path.write_text(document, encoding="utf-8")
+    staging_path.replace(path)
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the bundle builder."""
+    parser = argparse.ArgumentParser(
+        description="Build OCR table-study session bundles."
+    )
+
+    path_options = (
+        ("--source-manifest", DEFAULT_SOURCE_MANIFEST),
+        ("--output-dir", DEFAULT_OUTPUT_DIR),
+    )
+    for flag, default in path_options:
+        parser.add_argument(flag, type=Path, default=default)
+
+    int_options = (
+        ("--seed", 42),
+        ("--total-tables", DEFAULT_TOTAL_TABLES),
+        ("--min-session-items", DEFAULT_MIN_SESSION_ITEMS),
+        ("--max-session-items", DEFAULT_MAX_SESSION_ITEMS),
+    )
+    for flag, default in int_options:
+        parser.add_argument(flag, type=int, default=default)
+
+    parser.add_argument(
+        "--annotators",
+        type=int,
+        nargs="+",
+        default=sorted(DEFAULT_OVERLAP_BY_ANNOTATORS),
+        help="Annotator counts to build bundles for, e.g. --annotators 14 15 16",
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Build and write one study bundle per requested annotator count.
+
+    Each bundle is written to ``<output-dir>/study_sessions_<count>.json`` and
+    a JSON digest is printed for it. Returns 0.
+
+    Raises:
+        ValueError: a requested annotator count has no entry in
+            ``DEFAULT_OVERLAP_BY_ANNOTATORS``. All counts are checked before
+            anything is built, so an invalid request never leaves a partial
+            set of bundles on disk.
+    """
+    args = build_arg_parser().parse_args(argv)
+
+    # Validate the whole request up front instead of failing mid-loop after
+    # some bundles have already been written.
+    for annotator_count in args.annotators:
+        if annotator_count not in DEFAULT_OVERLAP_BY_ANNOTATORS:
+            raise ValueError(
+                f"no default overlap setting for annotator count {annotator_count}"
+            )
+
+    for annotator_count in args.annotators:
+        overlap_tables = DEFAULT_OVERLAP_BY_ANNOTATORS[annotator_count]
+        bundle = build_study_bundle(
+            source_manifest_path=args.source_manifest,
+            annotator_count=annotator_count,
+            overlap_tables=overlap_tables,
+            total_tables=args.total_tables,
+            seed=args.seed,
+            min_session_items=args.min_session_items,
+            max_session_items=args.max_session_items,
+        )
+        output_path = args.output_dir / f"study_sessions_{annotator_count}.json"
+        write_bundle(output_path, bundle)
+        print(
+            json.dumps(
+                {
+                    "annotator_count": annotator_count,
+                    "overlap_tables": overlap_tables,
+                    "output": str(output_path),
+                    **bundle["summary"],
+                },
+                indent=2,
+            )
+        )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/annotation_OCR/summarize.py b/annotation_OCR/summarize.py
new file mode 100644
index 0000000..31d565d
--- /dev/null
+++ b/annotation_OCR/summarize.py
@@ -0,0 +1,58 @@
+"""Regenerate OCR annotation session summaries."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from store import list_sessions, write_all_sessions_summary, write_summary_files
+
+
+def build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for summary regeneration.
+
+    ``--session-id`` may be repeated; ``--all`` selects every stored session;
+    ``--combined-output`` optionally names the combined CSV path.
+    """
+    all_help = "Regenerate summaries for every session under annotation_OCR/sessions."
+    combined_help = "Optional path for the combined all-sessions CSV."
+
+    cli = argparse.ArgumentParser(description="Regenerate OCR annotation summaries.")
+    cli.add_argument("--session-id", action="append", default=[])
+    cli.add_argument("--all", action="store_true", help=all_help)
+    cli.add_argument("--combined-output", type=Path, default=None, help=combined_help)
+    return cli
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Regenerate per-session summaries and, optionally, the combined CSV.
+
+    Sessions named with ``--session-id`` come first, followed by every stored
+    session when ``--all`` is given; each session is processed once, in first
+    appearance order. The combined summary is written when ``--all`` or
+    ``--combined-output`` is supplied. Prints a JSON report and returns 0.
+    """
+    args = build_arg_parser().parse_args(argv)
+
+    requested = list(args.session_id)
+    if args.all:
+        requested += [metadata["session_id"] for metadata in list_sessions()]
+
+    # dict.fromkeys drops repeats while keeping first-seen order.
+    regenerated = [
+        {"session_id": session_id, **write_summary_files(session_id)}
+        for session_id in dict.fromkeys(requested)
+    ]
+
+    if args.all or args.combined_output:
+        combined = str(write_all_sessions_summary(args.combined_output))
+    else:
+        combined = None
+
+    report = {"regenerated": regenerated, "combined_summary_csv": combined}
+    print(json.dumps(report, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/annotation_OCR/templates/index.html b/annotation_OCR/templates/index.html
new file mode 100644
index 0000000..bc13f29
--- /dev/null
+++ b/annotation_OCR/templates/index.html
@@ -0,0 +1,155 @@
+
+
+
+
+
+
+ OCR Annotation
+
+
+
+
+
+
+
+
+
OCR annotation
+
Loading session
+
+
+
+
+
+
+ Sessions
+ Prev
+ Next
+ Next open
+ ?
+
+
+
+
+
+
+
+
+
+
+
+
Raw image unavailable
+
+
+
+
+
+
+
+
Current item
+
Report
+
+
+
Industry
+
+
+
+
Ticker
+
+
+
+
Page
+
+
+
+
Signals
+
+
+
+
Mapping
+
+
+
+
+
+
+
Decision
+
+ Yes
+ No
+ Uncertain
+
+
+
+
+ Note, if needed
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/annotation_OCR/templates/landing.html b/annotation_OCR/templates/landing.html
new file mode 100644
index 0000000..5a69e0d
--- /dev/null
+++ b/annotation_OCR/templates/landing.html
@@ -0,0 +1,263 @@
+
+
+
+
+
+
+ OCR Annotation — Start
+
+
+
+
+
+
+
OCR Annotation
+
Enter your name to start a new annotation session, or resume an existing one below.
+
+
+
+
+
Resume existing session
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index cf0bb03..0d39bc9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,9 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
+ "bleach>=6.3.0",
+ "flask>=3.1.3",
+ "markdown>=3.10.2",
"openai>=2.33.0",
"pydantic>=2.13.3",
"tqdm>=4.67.3",
diff --git a/scripts/fix_broken_dollar_overlap.py b/scripts/fix_broken_dollar_overlap.py
new file mode 100644
index 0000000..f3ae8b4
--- /dev/null
+++ b/scripts/fix_broken_dollar_overlap.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+r"""Replace broken dollar markers in .mmd files using heuristic-based selection.
+
+Heuristic A (pair-based):
+- Adjacent marker pair "\(" then "\)" with no curly braces between them.
+
+Heuristic B (money-context):
+- Marker appears to precede an amount-like token or nearby money phrasing.
+- Excludes obvious math-like markup such as "\( _{2}" and "\( ^{TM}".
+
+Selection strategies:
+- money: use only money-context markers (higher recall; default).
+- overlap: use intersection of pair-based and money-context markers (higher precision).
+
+Always-on exact rule:
+- Replace exact table cell markers "\( " and "\) ".
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+from pathlib import Path
+from typing import Iterable
+
+# Either marker token: a backslash followed by "(" or by ")".
+MARKER_RE = re.compile(r"\\\(|\\\)")
+# Amount-like prefix: optional whitespace, an optional "(" or "-", then a digit
+# followed by any run of digits/commas and an optional decimal part.
+MONEY_NUM_RE = re.compile(r"^\s*[\(\-]?\d(?:[\d,]*\.?\d*)")
+# A magnitude word preceded by at most 24 characters (case-insensitive).
+MONEY_WORD_RE = re.compile(r"^.{0,24}\b(?:million|billion|thousand)\b", re.IGNORECASE)
+# A pricing/valuation phrase preceded by at most 30 characters (case-insensitive).
+MONEY_PHRASE_RE = re.compile(
+    r"^.{0,30}\b(?:per\s+share|per\s+ton|per\s+gallon|per\s+bushel|market\s+value)\b",
+    re.IGNORECASE,
+)
+# Math-like continuation: optional whitespace, an optional "_" or "^", then "{".
+MATHISH_RE = re.compile(r"^\s*[_\^]?\s*\{")
+# A marker token immediately followed by one space; group 1 is the marker itself.
+# NOTE(review): the "TD" name and the module docstring's "exact table cell" wording
+# suggest this was meant to match only markers that make up a whole table cell.
+# As written it matches any marker followed by a space -- confirm the pattern.
+EXACT_TD_RE = re.compile(r"(\\\(|\\\)) ")
+
+
+def iter_mmd_files(root: Path) -> Iterable[Path]:
+    """Yield each regular file below ``root`` (searched recursively) named ``*.mmd``.
+
+    Directories that happen to match the pattern are skipped.
+    """
+    yield from (
+        candidate for candidate in root.rglob("*.mmd") if candidate.is_file()
+    )
+
+
+def get_markers(text: str) -> list[tuple[int, str]]:
+    """Return ``(offset, token)`` for every ``MARKER_RE`` match in ``text``.
+
+    Entries appear in the order the matches occur in ``text``.
+    """
+    found: list[tuple[int, str]] = []
+    for match in MARKER_RE.finditer(text):
+        found.append((match.start(), match.group(0)))
+    return found
+
+
+def select_user_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
+    """Pair-based heuristic: pick opening/closing marker pairs without braces.
+
+    Two consecutive entries of ``markers`` are selected when the first is an
+    opening token, the second is a closing token, and the text between them
+    contains neither "{" nor "}".
+
+    Args:
+        text: The full document text the offsets refer to.
+        markers: ``(offset, token)`` pairs in document order.
+
+    Returns:
+        The offsets of both tokens of every selected pair.
+    """
+    opening, closing = r"\(", r"\)"
+    picked: set[int] = set()
+    for (start, first), (end, second) in zip(markers, markers[1:]):
+        if (first, second) != (opening, closing):
+            continue
+        # Skip the two characters of the opening token itself.
+        enclosed = text[start + 2 : end]
+        if not ("{" in enclosed or "}" in enclosed):
+            picked.update((start, end))
+    return picked
+
+
+def select_money_context_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
+    """Money-context heuristic: pick markers followed by money-like text.
+
+    For each marker, the 64 characters after the two-character token are
+    inspected. The marker is skipped when that window matches ``MATHISH_RE``.
+    Otherwise it is selected when the window matches any of ``MONEY_NUM_RE``,
+    ``MONEY_WORD_RE`` or ``MONEY_PHRASE_RE``.
+
+    Args:
+        text: The full document text the offsets refer to.
+        markers: ``(offset, token)`` pairs; only the offsets are used.
+
+    Returns:
+        The offsets of all selected markers.
+    """
+    money_patterns = (MONEY_NUM_RE, MONEY_WORD_RE, MONEY_PHRASE_RE)
+    picked: set[int] = set()
+    for offset, _token in markers:
+        window = text[offset + 2 : offset + 66]
+        if MATHISH_RE.match(window):
+            # Math-like markup directly after the marker is never treated as money.
+            continue
+        if any(pattern.match(window) for pattern in money_patterns):
+            picked.add(offset)
+    return picked
+
+
+def select_exact_td_markers(text: str) -> set[int]:
+    """Return the offset of the marker token for every ``EXACT_TD_RE`` match.
+
+    The offset is that of capture group 1 (the marker), not of the whole match.
+    NOTE(review): the name suggests matches are limited to table cells; that
+    depends entirely on how ``EXACT_TD_RE`` is defined -- confirm.
+    """
+    offsets: set[int] = set()
+    for match in EXACT_TD_RE.finditer(text):
+        offsets.add(match.start(1))
+    return offsets
+
+
+def apply_replacements(
+    text: str, markers: list[tuple[int, str]], positions: set[int]
+) -> tuple[str, int]:
+    """Replace every selected marker in ``text`` with ``$``.
+
+    Args:
+        text: The original document text.
+        markers: ``(offset, token)`` pairs in ascending offset order.
+        positions: Offsets chosen for replacement. Offsets that do not appear
+            in ``markers`` are ignored.
+
+    Returns:
+        ``(new_text, count)`` where ``count`` is the number of markers replaced.
+        ``text`` is returned unchanged with a count of 0 when ``positions`` is
+        empty.
+    """
+    if not positions:
+        return text, 0
+
+    chosen = [offset for offset, _token in markers if offset in positions]
+
+    pieces: list[str] = []
+    resume_at = 0
+    for offset in chosen:
+        pieces += [text[resume_at:offset], "$"]
+        # Continue after the two characters occupied by the replaced marker.
+        resume_at = offset + 2
+    pieces.append(text[resume_at:])
+
+    return "".join(pieces), len(chosen)
+
+
+def process_file(path: Path, dry_run: bool, strategy: str) -> dict[str, int]:
+    """Select and replace broken dollar markers in one file.
+
+    Args:
+        path: File to process; read and written as UTF-8.
+        dry_run: When true, compute the statistics but leave the file untouched.
+        strategy: ``"money"`` to use the money-context selection, or
+            ``"overlap"`` to use the intersection of the pair-based and
+            money-context selections. The exact-marker selection is added in
+            both cases.
+
+    Returns:
+        Counts keyed by ``markers``, ``user``, ``money``, ``overlap``,
+        ``td_exact`` and ``replaced``, plus ``changed`` (1 if at least one
+        replacement was made, else 0).
+
+    Raises:
+        ValueError: If ``strategy`` is not one of the two supported values.
+    """
+    # newline="" turns off newline translation. Without it, a file with CRLF
+    # (or lone CR) line endings would be read as LF and written back as LF on
+    # POSIX, changing every line even though only markers were replaced.
+    with path.open("r", encoding="utf-8", newline="") as handle:
+        text = handle.read()
+    markers = get_markers(text)
+
+    user_positions = select_user_markers(text, markers)
+    money_positions = select_money_context_markers(text, markers)
+    overlap = user_positions & money_positions
+    td_exact_positions = select_exact_td_markers(text)
+
+    if strategy == "money":
+        selected_positions = money_positions | td_exact_positions
+    elif strategy == "overlap":
+        selected_positions = overlap | td_exact_positions
+    else:
+        raise ValueError(f"Unknown strategy: {strategy}")
+
+    updated_text, replaced = apply_replacements(text, markers, selected_positions)
+
+    changed = int(replaced > 0)
+    if replaced > 0 and not dry_run:
+        # Write with newline="" as well so the original line endings survive.
+        with path.open("w", encoding="utf-8", newline="") as handle:
+            handle.write(updated_text)
+
+    return {
+        "markers": len(markers),
+        "user": len(user_positions),
+        "money": len(money_positions),
+        "overlap": len(overlap),
+        "td_exact": len(td_exact_positions),
+        "replaced": replaced,
+        "changed": changed,
+    }
+
+
+def main() -> int:
+    """Command-line entry point: process every ``.mmd`` file under a directory.
+
+    Parses the command line, runs ``process_file`` on each file yielded by
+    ``iter_mmd_files``, sums the per-file statistics and prints a
+    ``KEY=value`` report.
+
+    Returns:
+        0 after the report has been printed.
+
+    Raises:
+        SystemExit: If the given directory does not exist or is not a directory.
+    """
+    description = (
+        "Replace broken dollar markers in .mmd files using heuristic-based "
+        "selection."
+    )
+    strategy_help = (
+        "Replacement selection strategy: 'money' (higher recall, default) "
+        "or 'overlap' (higher precision)."
+    )
+
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument(
+        "directory", type=Path, help="Root directory to scan recursively"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Compute and report changes without writing files",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true", help="Print per-file replacement counts"
+    )
+    parser.add_argument(
+        "--strategy", choices=("money", "overlap"), default="money", help=strategy_help
+    )
+    args = parser.parse_args()
+
+    root = args.directory
+    if not (root.exists() and root.is_dir()):
+        raise SystemExit(f"Directory not found or not a directory: {root}")
+
+    # Per-file statistics that are summed across all processed files.
+    stat_keys = (
+        "markers",
+        "user",
+        "money",
+        "overlap",
+        "td_exact",
+        "replaced",
+        "changed",
+    )
+    totals = {"files": 0}
+    totals.update(dict.fromkeys(stat_keys, 0))
+
+    for path in iter_mmd_files(root):
+        stats = process_file(path, dry_run=args.dry_run, strategy=args.strategy)
+        totals["files"] += 1
+        for key in stat_keys:
+            totals[key] += stats[key]
+
+        if args.verbose and stats["replaced"] > 0:
+            print(f"{path}: replacements={stats['replaced']}")
+
+    if args.dry_run:
+        mode = "DRY RUN"
+    else:
+        mode = "APPLY"
+
+    print(f"MODE={mode}")
+    print(f"STRATEGY={args.strategy}")
+    print(f"FILES_SCANNED={totals['files']}")
+    print(f"TOTAL_MARKER_TOKENS={totals['markers']}")
+    print(f"USER_HEURISTIC_TOTAL={totals['user']}")
+    print(f"MONEY_HEURISTIC_TOTAL={totals['money']}")
+    print(f"OVERLAP_TOTAL={totals['overlap']}")
+    print(f"EXACT_TD_TOTAL={totals['td_exact']}")
+    print(f"REPLACEMENTS={totals['replaced']}")
+    print(f"FILES_CHANGED={totals['changed']}")
+
+    return 0
+
+
+# Script entry point: run main() and use its return value as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/uv.lock b/uv.lock
index c0651b8..869ffcd 100644
--- a/uv.lock
+++ b/uv.lock
@@ -34,6 +34,9 @@ name = "ardian-dataset-bench"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
+ { name = "bleach" },
+ { name = "flask" },
+ { name = "markdown" },
{ name = "openai" },
{ name = "pydantic" },
{ name = "tqdm" },
@@ -42,6 +45,9 @@ dependencies = [
[package.metadata]
requires-dist = [
+ { name = "bleach", specifier = ">=6.3.0" },
+ { name = "flask", specifier = ">=3.1.3" },
+ { name = "markdown", specifier = ">=3.10.2" },
{ name = "openai", specifier = ">=2.33.0" },
{ name = "pydantic", specifier = ">=2.13.3" },
{ name = "tqdm", specifier = ">=4.67.3" },
@@ -61,6 +67,27 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
]
+[[package]]
+name = "bleach"
+version = "6.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "webencodings" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/07/18/3c8523962314be6bf4c8989c79ad9531c825210dd13a8669f6b84336e8bd/bleach-6.3.0.tar.gz", hash = "sha256:6f3b91b1c0a02bb9a78b5a454c92506aa0fdf197e1d5e114d2e00c6f64306d22", size = 203533, upload-time = "2025-10-27T17:57:39.211Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" },
+]
+
+[[package]]
+name = "blinker"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
+]
+
[[package]]
name = "certifi"
version = "2026.4.22"
@@ -172,6 +199,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" },
]
+[[package]]
+name = "click"
+version = "8.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9b/98/518d8e5081007684232226f475082b30087d0f585e8457db087298259f49/click-8.4.1.tar.gz", hash = "sha256:918b5633eddf6b41c32d4f454bf0de810065c74e3f7dbf8ee5452f8be88d3e96", size = 353007, upload-time = "2026-05-22T04:08:37.769Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c7/0d/67e5b4109ea4a837e80daa87c2c696711955e40449a97e8926672534def2/click-8.4.1-py3-none-any.whl", hash = "sha256:482be17c6991b8c19c5429a1e995d9b0efdbb63172824c41f99965dc0ade8ec2", size = 116639, upload-time = "2026-05-22T04:08:35.26Z" },
+]
+
[[package]]
name = "colorama"
version = "0.4.6"
@@ -223,6 +262,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
]
+[[package]]
+name = "flask"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "blinker" },
+ { name = "click" },
+ { name = "itsdangerous" },
+ { name = "jinja2" },
+ { name = "markupsafe" },
+ { name = "werkzeug" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" },
+]
+
[[package]]
name = "frozendict"
version = "2.4.7"
@@ -278,6 +334,27 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" },
]
+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
+]
+
[[package]]
name = "jiter"
version = "0.14.0"
@@ -332,6 +409,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" },
]
+[[package]]
+name = "markdown"
+version = "3.10.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" },
+]
+
[[package]]
name = "markdown-it-py"
version = "4.0.0"
@@ -344,6 +430,58 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
]
+[[package]]
+name = "markupsafe"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" },
+ { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" },
+ { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" },
+ { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" },
+ { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" },
+ { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" },
+ { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" },
+ { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" },
+ { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" },
+ { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" },
+ { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" },
+ { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
+ { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
+ { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
+ { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
+ { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
+ { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
+ { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
+ { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
+ { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
+ { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
+ { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
+ { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
+ { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
+ { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
+ { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
+ { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
+ { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
+ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
+]
+
[[package]]
name = "mdurl"
version = "0.1.2"
@@ -698,6 +836,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
]
+[[package]]
+name = "webencodings"
+version = "0.5.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721, upload-time = "2017-04-05T20:21:34.189Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload-time = "2017-04-05T20:21:32.581Z" },
+]
+
[[package]]
name = "websockets"
version = "16.0"
@@ -734,6 +881,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
]
+[[package]]
+name = "werkzeug"
+version = "3.1.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852, upload-time = "2026-04-02T18:49:14.268Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/93/8c/2e650f2afeb7ee576912636c23ddb621c91ac6a98e66dc8d29c3c69446e1/werkzeug-3.1.8-py3-none-any.whl", hash = "sha256:63a77fb8892bf28ebc3178683445222aa500e48ebad5ec77b0ad80f8726b1f50", size = 226459, upload-time = "2026-04-02T18:49:12.72Z" },
+]
+
[[package]]
name = "yfinance"
version = "1.3.0"