diff --git a/.gitignore b/.gitignore index 894a2bc..4db1aa8 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,9 @@ doc_text_processing/CEO_word_extraction/cleaning_extractions/cleaned/ KPI_analysis/cache/ KPI_analysis/output/ +# OCR annotation artifacts +annotation_OCR/sessions/ + # VSCode settings .vscode/settings.json diff --git a/annotation_OCR/README.md b/annotation_OCR/README.md new file mode 100644 index 0000000..ebded4e --- /dev/null +++ b/annotation_OCR/README.md @@ -0,0 +1,250 @@ +# OCR Annotation Interface + +Browser interface for reviewing OCR table extraction quality. The app now +defaults to table-level items extracted from `*_det.mmd`, shows the isolated +HTML table in the extracted-content pane, and auto-centers the raw page image +on the detected table region while still allowing manual zoom-out for more +context. + +Annotations are stored under `annotation_OCR/sessions/` so quality labels can +later be joined to downstream benchmark outputs. + +## Run + +### Headless mode (recommended for multi-user) + +Start the server with no session arguments — annotators create/resume sessions +from the browser landing page. If `annotation_OCR/manifests/tables_5000.json` +exists, the server uses it automatically for fast session creation. Otherwise +it falls back to building a sampled table queue directly from the OCR corpus. + +```bash +uv run python annotation_OCR/server.py --host 0.0.0.0 --port 5050 +``` + +Then open `http://HOST:5050`. The landing page lets each user enter their name, +create a new session, or resume an existing one. No CLI or Python knowledge +needed on the annotator side. 
+ +### Pre-created session (single-user / scripted) + +From the repository root: + +```bash +uv run python annotation_OCR/server.py \ + --session-name "table QA smoke" \ + --annotator "your-name" \ + --queue-mode tables \ + --sample-size 100 \ + --host 127.0.0.1 \ + --port 5050 +``` + +For a small smoke run: + +```bash +uv run python annotation_OCR/server.py \ + --session-name smoke \ + --annotator test \ + --queue-mode tables \ + --sample-size 20 \ + --limit-reports 2 \ + --host 127.0.0.1 \ + --port 5050 +``` + +To force the server to use an explicit precomputed manifest: + +```bash +uv run python annotation_OCR/server.py \ + --manifest-path annotation_OCR/manifests/tables_5000.json \ + --host 127.0.0.1 \ + --port 5050 +``` + +To use precomputed study-session bundles for a paper annotation round: + +```bash +uv run python annotation_OCR/server.py \ + --study-bundle annotation_OCR/manifests/study_sessions_15.json \ + --host 127.0.0.1 \ + --port 5050 +``` + +Each new session created from the landing page then receives the next fixed +session queue from that bundle, so the progress bar tracks a real per-annotator +target rather than the whole table pool. + +Resume an existing session: + +```bash +uv run python annotation_OCR/server.py --session-id SESSION_ID --host 127.0.0.1 --port 5050 +``` + +SSH port forwarding from a laptop: + +```bash +ssh -L 5050:127.0.0.1:5050 USER@SERVER +``` + +Then open `http://127.0.0.1:5050` locally. + +For table sessions, the extracted-content pane shows only the isolated table and +the raw-image pane auto-refocuses on the detected bounding box. Use `Refocus` +or press `F` to jump back to the table after manual exploration. 
+ +## Precompute A Reusable 5,000-Table Manifest + +Build the reusable subset once offline: + +```bash +mkdir -p annotation_OCR/manifests + +uv run python annotation_OCR/ocr_index.py \ + --queue-mode tables \ + --sample-size 5000 \ + --seed 42 \ + --output annotation_OCR/manifests/tables_5000.json +``` + +That manifest can then be reused by the server so new annotation sessions do +not need to rescan the OCR corpus. + +## Build Study Session Bundles + +For hybrid annotation rounds, build one bundle for each possible annotator +count. The generated bundles already keep each session inside the target range +of 120 to 140 items: + +```bash +uv run python annotation_OCR/study_sessions.py \ + --source-manifest annotation_OCR/manifests/tables_5000.json \ + --output-dir annotation_OCR/manifests \ + --annotators 14 15 16 \ + --seed 42 +``` + +This writes: + +- `annotation_OCR/manifests/study_sessions_14.json` +- `annotation_OCR/manifests/study_sessions_15.json` +- `annotation_OCR/manifests/study_sessions_16.json` + +The 15- and 16-annotator bundles use 1500 unique tables with 300 triple-coded +agreement tables. The 14-annotator bundle lowers the agreement subset to 220 so +all session quotas still stay within the 120 to 140 target range. + +## Compute Agreement After Annotation + +After the study round, compute overlap agreement plus accept/reject ratios with: + +```bash +uv run python annotation_OCR/study_agreement.py \ + --study-bundle annotation_OCR/manifests/study_sessions_15.json +``` + +By default this writes analysis artifacts under: + +- `annotation_OCR/sessions/study_analysis/study_sessions_15/summary.md` +- `annotation_OCR/sessions/study_analysis/study_sessions_15/summary.json` +- `annotation_OCR/sessions/study_analysis/study_sessions_15/session_metrics.csv` +- `annotation_OCR/sessions/study_analysis/study_sessions_15/item_metrics.csv` + +The script auto-discovers sessions created from that bundle via their stored +`study_bundle_path` and `study_slot`. 
It reports exact agreement, pairwise +agreement, Fleiss' kappa, and accept/reject ratios both at the raw vote level +and at the final table-decision level. + +## Data Sources + +Defaults: + +- OCR Markdown root: `DeepSeekOCR_Ardian_pruned_1k/` +- Raw image root: `/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs/` +- Default reusable manifest path: `annotation_OCR/manifests/tables_5000.json` + +Each queued table item maps back to the raw PNG page with the same zero-based +page index, for example page index `12` maps to `pages/page_0012.png`. Table +items carry the `_det.mmd` bounding box used by the UI to center the preview. +The manifest records mapping warnings such as missing raw images or page-count +mismatches. + +## Queue Modes + +- `tables`: default. Queues table-level items from `*_det.mmd`. Use `--sample-size` for deterministic random sampling. +- `table-candidates`: legacy page-level mode. Keeps pages with table-like signals, dense numeric rows, financial statement headings, or KPI aliases. +- `all`: legacy page-level mode that queues every page. +- `sample`: legacy seeded random sample across all discovered pages. + +Indexer smoke check: + +```bash +uv run python annotation_OCR/ocr_index.py \ + --ocr-root DeepSeekOCR_Ardian_pruned_1k \ + --raw-root /data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs \ + --queue-mode tables \ + --sample-size 20 \ + --limit-reports 2 \ + --check +``` + +## Keyboard + +- `a`: mark Yes, save, advance +- `r`: mark No, save, advance +- `u`: mark Uncertain, save, advance +- `j` / right arrow: next page +- `k` / left arrow: previous page +- `+`, `-`, `0`: zoom / reset +- `f`: refocus on the detected table +- `?`: shortcut dialog + +Shortcuts are disabled while typing in notes or editing form controls. + +## Outputs + +Each session writes to `annotation_OCR/sessions/{session_id}/`: + +- `metadata.json`: session name, annotator, configuration, counts, timestamps. 
+- `manifest.json`: queued items and mapping diagnostics. +- `annotations.jsonl`: append-only event log, one saved annotation per line. +- `current_annotations.json`: latest annotation per item, written atomically. +- `summary.csv`: one row per queued item, including unreviewed items. +- `summary.md`: status-count overview. + +Regenerate summaries: + +```bash +uv run python annotation_OCR/summarize.py --session-id SESSION_ID +uv run python annotation_OCR/summarize.py --all +``` + +## Annotation Schema + +Primary fields: + +- `overall_status`: `ok`, `not_ok`, `uncertain`, or `unreviewed` +- `notes`: optional free text + +Identity fields include `industry_slug`, `report_name`, `exchange`, `ticker`, `year`, `page_index`, `page_number`, `mmd_path`, `raw_png_path`, and `page_text_sha256`. + +For table sessions, summary rows also include `item_kind`, `table_index`, +`table_row_count`, `table_col_count`, `det_mmd_path`, and `focus_bbox`. + +## Downstream Joins + +For table-level filtering, join annotation summaries on: + +```text +exchange, ticker, year, page_index, table_index +``` + +For report-level benchmark filtering, aggregate page labels to: + +```text +exchange, ticker, year +``` + +A conservative report-level rule is to exclude a report when any reviewed table +item is `not_ok`, or when the share of `uncertain` table items exceeds a +threshold chosen for the benchmark run. \ No newline at end of file diff --git a/annotation_OCR/__init__.py b/annotation_OCR/__init__.py new file mode 100644 index 0000000..e045a18 --- /dev/null +++ b/annotation_OCR/__init__.py @@ -0,0 +1 @@ +"""OCR annotation interface package.""" diff --git a/annotation_OCR/manifests/README.md b/annotation_OCR/manifests/README.md new file mode 100644 index 0000000..cba2bc8 --- /dev/null +++ b/annotation_OCR/manifests/README.md @@ -0,0 +1,46 @@ +# Table Manifests + +Place reusable sampled table manifests here. 
+ +Recommended default: + +```bash +uv run python annotation_OCR/ocr_index.py \ + --queue-mode tables \ + --sample-size 5000 \ + --seed 42 \ + --output annotation_OCR/manifests/tables_5000.json +``` + +When `tables_5000.json` exists, `annotation_OCR/server.py` will use it by default for new sessions. + +## Study Session Bundles + +For paper annotation rounds, also build the headcount-specific session bundles: + +```bash +uv run python annotation_OCR/study_sessions.py \ + --source-manifest annotation_OCR/manifests/tables_5000.json \ + --output-dir annotation_OCR/manifests \ + --annotators 14 15 16 \ + --seed 42 +``` + +This creates: + +- `study_sessions_14.json` +- `study_sessions_15.json` +- `study_sessions_16.json` + +Use the bundle matching the final annotator count when starting the server: + +```bash +uv run python annotation_OCR/server.py \ + --study-bundle annotation_OCR/manifests/study_sessions_15.json +``` + +Why the 14-annotator bundle differs: + +- `1500 unique + 300 triple-coded` requires `2100` total annotations. +- That fits 15 or 16 annotators while keeping each session in the `120–140` range. +- For 14 annotators, the bundle uses `220` agreement tables instead, for `1940` total annotations and per-session targets of `138–139`. diff --git a/annotation_OCR/ocr_index.py b/annotation_OCR/ocr_index.py new file mode 100644 index 0000000..fb7ac21 --- /dev/null +++ b/annotation_OCR/ocr_index.py @@ -0,0 +1,928 @@ +"""Build OCR annotation queues. + +The annotation UI can work either at page level from canonical ``.mmd`` files +or at table level from ``*_det.mmd`` files that carry OCR coordinates. +Page positions are preserved exactly: page index ``i`` in an ``.mmd`` split +maps to ``pages/page_XXXX.png`` with the same zero-based index when the raw +image exists. 
# Delimiter between consecutive pages inside an ``.mmd`` file (see split_pages).
PAGE_SPLIT_RE = re.compile(r"<---\s*Page Split\s*--->", re.IGNORECASE)
# Report directory name: group 1 = exchange, group 2 = ticker, group 3 = year,
# optionally followed by a hexadecimal hash suffix of eight or more characters.
REPORT_NAME_RE = re.compile(r"^([A-Z0-9-]+)_(.+)_(\d{4})(?:_[0-9a-fA-F]{8,})?$")
# Trailing hash suffix on its own, for names REPORT_NAME_RE does not match.
HASH_SUFFIX_RE = re.compile(r"_[0-9a-fA-F]{8,}$")
# Header line of one detection block in a ``*_det.mmd`` page:
# group 1 = reference type, group 2 = raw bounding-box text.
DET_HEADER_RE = re.compile(
    r"(?m)^<\|ref\|>([^<]+)<\|/ref\|><\|det\|>(.*?)<\|/det\|>\s*$"
)
# One HTML table row; group 1 is the markup between <tr ...> and </tr>.
# ``\b`` keeps longer tag names such as <track> from matching.
HTML_ROW_RE = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.IGNORECASE | re.DOTALL)
# One HTML table cell (<td> or <th>); group 1 is the cell markup.
# ``\b`` keeps <thead> from being counted as a <th> cell.
HTML_CELL_RE = re.compile(
    r"<t[dh]\b[^>]*>(.*?)</t[dh]>", re.IGNORECASE | re.DOTALL
)
# Any HTML tag; used to strip markup down to plain text.
HTML_TAG_RE = re.compile(r"<[^>]+>")
def find_mmd(report_dir: Path) -> Path | None:
    """Locate the canonical ``.mmd`` file of a report directory.

    Lookup order:

    1. ``<directory name>.mmd``
    2. ``<report base name>.mmd`` (directory name without its hash suffix)
    3. the alphabetically first ``*.mmd`` that is not a ``*_det.mmd``
    4. the alphabetically first ``*.mmd`` of any kind

    Returns ``None`` when the directory holds no ``.mmd`` file at all.
    """
    dir_name = report_dir.name

    exact = report_dir / f"{dir_name}.mmd"
    if exact.is_file():
        return exact

    without_hash = report_dir / f"{report_base_name(dir_name)}.mmd"
    if without_hash.is_file():
        return without_hash

    # A single sorted listing serves both remaining fallbacks: filtering a
    # sorted list keeps its order, so the first survivor is still the minimum.
    every_mmd = sorted(report_dir.glob("*.mmd"))
    for mmd in every_mmd:
        if not mmd.name.endswith("_det.mmd"):
            return mmd
    if every_mmd:
        return every_mmd[0]
    return None
def parse_bboxes(raw: str) -> list[list[int]]:
    """Extract bounding boxes from the raw text of a detection header.

    Every (optionally negative) integer found in ``raw`` is read in order and
    grouped four at a time. A trailing group with fewer than four numbers is
    discarded.
    """
    numbers = [int(token) for token in re.findall(r"-?\d+", raw)]
    complete_groups = len(numbers) // 4
    return [
        numbers[4 * group : 4 * group + 4] for group in range(complete_groups)
    ]
def combined_bbox(bboxes: list[list[int]]) -> list[int] | None:
    """Return the smallest box enclosing every box in ``bboxes``.

    The result takes the minimum of the first two coordinates and the maximum
    of the last two across all boxes. An empty input yields ``None``.
    """
    if not bboxes:
        return None

    first = bboxes[0]
    low_a, low_b, high_a, high_b = first[0], first[1], first[2], first[3]
    for box in bboxes[1:]:
        low_a = min(low_a, box[0])
        low_b = min(low_b, box[1])
        high_a = max(high_a, box[2])
        high_b = max(high_b, box[3])
    return [low_a, low_b, high_a, high_b]
def page_png_for(page_pngs: list[Path], page_index: int) -> Path | None:
    """Pick the raw page image that belongs to a zero-based page index.

    A file named ``page_XXXX.png`` (index zero-padded to four digits) wins.
    If no file carries that name, the image at list position ``page_index``
    is used instead, provided the position exists. Otherwise ``None``.
    """
    wanted = f"page_{page_index:04d}.png"
    named = next((png for png in page_pngs if png.name == wanted), None)
    if named is not None:
        return named

    position_exists = 0 <= page_index < len(page_pngs)
    return page_pngs[page_index] if position_exists else None
def text_preview(text: str, max_chars: int = 500) -> str:
    """Collapse whitespace and truncate ``text`` to at most ``max_chars``.

    Runs of whitespace (including newlines) become single spaces. Text that
    fits is returned unchanged; longer text is cut and suffixed with ``...``
    so that the result, suffix included, never exceeds ``max_chars``.

    Args:
        text: Source text to summarise.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The compacted, possibly truncated preview. When ``max_chars`` is too
        small to hold the suffix the text is cut hard without it, and a
        non-positive ``max_chars`` yields an empty string.
    """
    compact = " ".join(text.split())
    if len(compact) <= max_chars:
        return compact

    suffix = "..."
    keep = max_chars - len(suffix)
    if keep <= 0:
        # No room for the suffix: a plain cut still honours the bound, and
        # clamping at 0 avoids a negative slice silently keeping text.
        return compact[: max(max_chars, 0)]
    # Reserve room for the suffix so the bound holds, and drop a trailing
    # space left by the cut so the suffix attaches directly to a word.
    return compact[:keep].rstrip() + suffix
def make_mapping_warnings(
    *,
    raw_dir: Path | None,
    page_pngs: list[Path],
    page_index: int,
    mmd_page_count: int,
    extra_warnings: list[str] | None = None,
) -> list[str]:
    """Collect diagnostics about how one OCR page maps onto raw page images.

    Warnings are emitted in this order:

    * ``raw-directory-missing`` when ``raw_dir`` is ``None``, otherwise
      ``raw-pages-directory-missing`` when it has no ``pages`` subdirectory;
    * ``page-count-mismatch`` when the number of PNGs differs from
      ``mmd_page_count``;
    * ``raw-page-image-missing`` when no image resolves for ``page_index``;
    * every entry of ``extra_warnings``, if any.
    """
    if raw_dir is None:
        directory_issue = ["raw-directory-missing"]
    elif (raw_dir / "pages").is_dir():
        directory_issue = []
    else:
        directory_issue = ["raw-pages-directory-missing"]

    count_issue = [] if len(page_pngs) == mmd_page_count else ["page-count-mismatch"]

    image_found = page_png_for(page_pngs, page_index) is not None
    image_issue = [] if image_found else ["raw-page-image-missing"]

    return [*directory_issue, *count_issue, *image_issue, *(extra_warnings or [])]
def iter_table_items(
    *,
    ocr_root: Path,
    raw_root: Path,
    limit_reports: int | None = None,
):
    """Yield one table-level ``PageItem`` per table block in ``*_det.mmd`` files.

    Reports are discovered under ``ocr_root`` (optionally only the first
    ``limit_reports``). Reports without a usable table source are skipped, as
    are pages without detection blocks and blocks that are not non-empty
    ``table`` blocks. ``table_index`` counts only the tables actually yielded
    on a page, starting at zero.

    Args:
        ocr_root: Root directory of the OCR Markdown corpus.
        raw_root: Root directory of the raw page-image corpus.
        limit_reports: If given, only the first N discovered reports are read.

    Yields:
        ``PageItem`` records with ``item_kind="table"``, the table HTML, its
        bounding boxes, and the text context before and after the table.
    """
    reports = discover_reports(ocr_root)
    if limit_reports is not None:
        reports = reports[:limit_reports]

    for report in reports:
        table_source = resolve_table_source(report, raw_root)
        if table_source is None:
            continue

        pages = load_pages(table_source.det_mmd_path)
        # table_source.report_dir is the pruned OCR directory whenever
        # resolve_table_source fell back to the local ``*_det.mmd``, so it
        # cannot stand in for the raw image directory. Resolve the raw
        # directory itself so the warnings and the recorded ``raw_dir``
        # describe the image source (``None`` when there is no raw match).
        raw_dir, _ = resolve_raw_dir(report, raw_root)
        raw_status = table_source.mapping_status
        page_pngs = table_source.page_pngs
        mmd_page_count = len(pages)
        png_page_count = len(page_pngs)
        extra_warnings = (
            [table_source.source_warning] if table_source.source_warning else []
        )

        for page_index, page_text in enumerate(pages):
            blocks = parse_det_blocks(page_text)
            if not blocks:
                continue

            # Mapping diagnostics are per page, so compute them once and
            # share them across every table found on the page.
            warnings = make_mapping_warnings(
                raw_dir=raw_dir,
                page_pngs=page_pngs,
                page_index=page_index,
                mmd_page_count=mmd_page_count,
                extra_warnings=extra_warnings,
            )
            raw_png = page_png_for(page_pngs, page_index)
            table_index = 0

            for block_index, block in enumerate(blocks):
                if block.ref_type != "table" or not block.payload:
                    continue

                context_before = nearby_context(blocks, block_index, direction=-1)
                context_after = nearby_context(blocks, block_index, direction=1)
                row_count, col_count = table_dimensions(block.payload)
                # Copy the boxes so the yielded item does not alias the block.
                focus_bboxes = [list(box) for box in block.bboxes]
                focus_bbox = combined_bbox(focus_bboxes)
                reasons = detect_table_reasons(
                    block.payload,
                    context_before=context_before,
                    context_after=context_after,
                )
                item_id = (
                    f"{report.industry_slug}/{report.name}/page_{page_index:04d}"
                    f"/table_{table_index:03d}"
                )
                preview_parts = [
                    context_before,
                    strip_html(block.payload),
                    context_after,
                ]
                yield PageItem(
                    item_id=item_id,
                    industry_slug=report.industry_slug,
                    report_name=report.name,
                    exchange=report.exchange,
                    ticker=report.ticker,
                    year=report.year,
                    page_index=page_index,
                    page_number=page_index + 1,
                    ocr_root=str(ocr_root),
                    raw_root=str(raw_root),
                    report_dir=str(table_source.report_dir),
                    raw_dir=str(raw_dir) if raw_dir else None,
                    mmd_path=str(table_source.mmd_path),
                    raw_png_path=str(raw_png) if raw_png else None,
                    mmd_page_count=mmd_page_count,
                    png_page_count=png_page_count,
                    mapping_status=raw_status,
                    mapping_warnings=warnings,
                    candidate_reasons=reasons,
                    # Hash and size describe the table payload, not the page.
                    page_text_sha256=page_text_hash(block.payload),
                    page_text_chars=len(block.payload),
                    page_text_preview=text_preview(
                        "\n".join(part for part in preview_parts if part)
                    ),
                    page_text="",
                    item_kind="table",
                    det_mmd_path=str(table_source.det_mmd_path),
                    table_index=table_index,
                    table_row_count=row_count,
                    table_col_count=col_count,
                    focus_bbox=focus_bbox,
                    focus_bboxes=focus_bboxes,
                    table_html=block.payload,
                    context_before=context_before,
                    context_after=context_after,
                )
                table_index += 1
def finish_summary_state(
    state: dict[str, Any], queue: list[PageItem]
) -> dict[str, Any]:
    """Turn the running scan state plus the final queue into a summary dict.

    Corpus-wide totals come from ``state`` (its sets are reduced to counts);
    the ``queue_*`` figures are computed from ``queue``. The three
    ``*_counts`` mappings are passed through from ``state`` without copying.
    """
    queued_reports = set()
    queued_pages = set()
    queued_tables = 0
    for entry in queue:
        queued_reports.add(entry.report_name)
        queued_pages.add((entry.report_name, entry.page_index))
        if entry.item_kind == "table":
            queued_tables += 1

    summary: dict[str, Any] = {}
    summary["reports_total"] = len(state["report_names"])
    summary["pages_total"] = len(state["page_keys"])
    for carried in ("items_total", "page_items_total", "table_items_total"):
        summary[carried] = state[carried]
    summary["queue_reports"] = len(queued_reports)
    summary["queue_pages"] = len(queued_pages)
    summary["queue_items"] = len(queue)
    summary["queue_table_items"] = queued_tables
    for carried in (
        "mapping_status_counts",
        "mapping_warning_counts",
        "candidate_reason_counts",
    ):
        summary[carried] = state[carried]
    return summary
def build_queue(
    *,
    ocr_root: Path,
    raw_root: Path,
    queue_mode: str = "tables",
    sample_size: int | None = None,
    seed: int = 17,
    limit: int | None = None,
    limit_reports: int | None = None,
) -> tuple[list[PageItem], dict[str, Any]]:
    """Scan the corpus once and build the annotation queue plus its summary.

    Modes:

    * ``tables``: table-level items; reservoir-sampled when ``sample_size``
      is given, otherwise every table is queued.
    * ``sample``: page-level items, reservoir-sampled (100 items when
      ``sample_size`` is omitted).
    * ``all``: every page.
    * ``table-candidates``: pages with at least one candidate reason.

    Sampled queues are sorted and then cut to ``limit``. Non-sampled scans
    stop as soon as the queue reaches ``limit``, so their summary totals only
    cover what was scanned (``scan_stopped_by_limit`` is then ``True``).

    Raises:
        ValueError: if ``queue_mode`` is not one of the four modes above.
    """
    known_modes = {"all", "table-candidates", "sample", "tables"}
    if queue_mode not in known_modes:
        raise ValueError(f"unknown queue mode: {queue_mode}")

    use_reservoir = queue_mode == "sample" or (
        queue_mode == "tables" and sample_size is not None
    )
    reservoir_size = 100 if sample_size is None else sample_size
    accept_everything = queue_mode in {"all", "tables"}
    produce_items = iter_page_items if queue_mode != "tables" else iter_table_items

    picker = random.Random(seed)
    totals = new_summary_state()
    queue: list[PageItem] = []
    hit_limit = False

    scanned = produce_items(
        ocr_root=ocr_root,
        raw_root=raw_root,
        limit_reports=limit_reports,
    )
    for seen, item in enumerate(scanned, start=1):
        update_summary_state(totals, item)

        if use_reservoir:
            # Reservoir sampling: fill the reservoir first, then let the
            # n-th item replace a random slot with probability size / n.
            if len(queue) < reservoir_size:
                queue.append(item)
            else:
                slot = picker.randint(0, seen - 1)
                if slot < reservoir_size:
                    queue[slot] = item
        elif accept_everything or item.candidate_reasons:
            queue.append(item)
            if limit is not None and len(queue) >= limit:
                hit_limit = True
                break

    if use_reservoir:
        # Replacement scrambles scan order; restore a stable reading order.
        queue.sort(
            key=lambda entry: (
                entry.industry_slug,
                entry.report_name,
                entry.page_index,
                entry.table_index or -1,
            )
        )
        if limit is not None:
            queue = queue[:limit]

    summary = finish_summary_state(totals, queue)
    summary.update(
        queue_mode=queue_mode,
        sample_size=sample_size,
        seed=seed,
        limit=limit,
        limit_reports=limit_reports,
        scan_stopped_by_limit=hit_limit,
        ocr_root=str(ocr_root),
        raw_root=str(raw_root),
    )
    return queue, summary
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: build a queue and emit its manifest/summary.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always ``0``.
    """
    options = build_arg_parser().parse_args(argv)

    # Forward the queue-building options straight from the parsed namespace.
    build_options = {
        field: getattr(options, field)
        for field in (
            "ocr_root",
            "raw_root",
            "queue_mode",
            "sample_size",
            "seed",
            "limit",
            "limit_reports",
        )
    }
    selected, overview = build_queue(**build_options)

    manifest = {
        "summary": overview,
        "items": [entry.to_manifest_record() for entry in selected],
    }

    destination = options.output
    if destination:
        write_json(destination, manifest)
    # Print the summary when explicitly asked to, or when nothing was written.
    if options.check or not destination:
        print(json.dumps(overview, indent=2))
    return 0
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the annotation web server.

    Returns:
        A parser exposing the corpus roots, session options, queue-building
        options and the host/port/debug settings used to run the app.
    """
    cli = argparse.ArgumentParser(description="Run the OCR annotation web UI.")
    add = cli.add_argument

    # The bundled manifest is only used as the default when it exists on disk.
    bundled_manifest = None
    if DEFAULT_TABLE_MANIFEST.is_file():
        bundled_manifest = DEFAULT_TABLE_MANIFEST

    # Corpus locations.
    add("--ocr-root", type=Path, default=DEFAULT_OCR_ROOT)
    add("--raw-root", type=Path, default=DEFAULT_RAW_ROOT)

    # Session selection / creation.
    add("--session-id", default=None, help="Resume an existing session.")
    add("--session-name", default="OCR annotation session")
    add("--annotator", default="anonymous")

    # Queue sources.
    add(
        "--study-bundle",
        type=Path,
        default=None,
        help="Optional per-session study bundle. When set, each new session gets the next precomputed session queue.",
    )
    add(
        "--manifest-path",
        type=Path,
        default=bundled_manifest,
        help="Optional precomputed queue manifest to reuse instead of rescanning OCR files.",
    )
    add(
        "--queue-mode",
        choices=["all", "table-candidates", "sample", "tables"],
        default="tables",
    )
    add("--sample-size", type=int, default=5000)
    add("--seed", type=int, default=17)
    add("--limit", type=int, default=None, help="Maximum queued items.")
    add(
        "--limit-reports",
        type=int,
        default=None,
        help="Read only the first N reports before queue selection.",
    )

    # Web server settings.
    add("--host", default="127.0.0.1")
    add("--port", type=int, default=5050)
    add("--debug", action="store_true")
    return cli
def load_precomputed_manifest(
    manifest_path: Path,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Load queue items and the index summary from a precomputed manifest file.

    Args:
        manifest_path: Path to a UTF-8 JSON file holding an object with an
            ``items`` list and an optional ``summary`` object.

    Returns:
        ``(items, summary)`` where ``summary`` is a copy of the stored summary
        with ``manifest_path`` added.

    Raises:
        ValueError: If the payload is not a JSON object, ``items`` is not a
            list of objects, or ``summary`` is not an object. (Malformed JSON
            raises ``json.JSONDecodeError``, itself a ``ValueError``.)
        OSError: If the file cannot be read.
    """
    payload = json.loads(manifest_path.read_text(encoding="utf-8"))
    # A top-level list/scalar would otherwise fail with AttributeError on .get().
    if not isinstance(payload, dict):
        raise ValueError(f"invalid manifest payload in {manifest_path}")
    items = payload.get("items")
    # Downstream code indexes items by key, so every entry must be an object.
    if not isinstance(items, list) or not all(isinstance(item, dict) for item in items):
        raise ValueError(f"invalid manifest items in {manifest_path}")
    summary = payload.get("summary") or {}
    if not isinstance(summary, dict):
        raise ValueError(f"invalid manifest summary in {manifest_path}")
    # Record where the queue came from without mutating the parsed payload.
    summary = {**summary, "manifest_path": str(manifest_path)}
    return items, summary
def resolve_session_source(
    *,
    study_bundle_path: Path | None,
    manifest_path: Path | None,
    ocr_root: Path,
    raw_root: Path,
    queue_mode: str,
    sample_size: int | None,
    seed: int,
    limit: int | None,
    limit_reports: int | None,
) -> tuple[list[dict[str, Any]], dict[str, Any], dict[str, Any]]:
    """Pick the queue source for a new session and return its items.

    Sources are tried in priority order: a study bundle, then a precomputed
    manifest, then a fresh queue built from the OCR corpus.

    Args:
        study_bundle_path: Study bundle to allocate the next session from.
        manifest_path: Precomputed manifest to reuse.
        ocr_root, raw_root, queue_mode, seed: Passed to ``build_queue`` when
            neither precomputed source is given.
        sample_size, limit, limit_reports: Optional non-negative counts. They
            may arrive from a JSON request body, so integer-like values (for
            example the string ``"10"``) are coerced with ``int()``.

    Returns:
        ``(items, summary, extra_config)``; ``extra_config`` is non-empty only
        for study-bundle sessions.

    Raises:
        ValueError: If a count is not integer-like or is negative, or if the
            chosen source is invalid.
    """

    def optional_count(name: str, value: Any) -> int | None:
        # Normalise an optional count; reject values that would otherwise be
        # mis-sliced (negative) or crash the slice (non-integers).
        if value is None:
            return None
        try:
            number = int(value)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"{name} must be an integer, got {value!r}") from exc
        if number < 0:
            raise ValueError(f"{name} must be non-negative, got {number}")
        return number

    limit = optional_count("limit", limit)
    sample_size = optional_count("sample_size", sample_size)
    limit_reports = optional_count("limit_reports", limit_reports)

    if study_bundle_path is not None:
        items, summary, config = allocate_study_session(study_bundle_path)
        if limit is not None:
            items = items[:limit]
            summary = {**summary, "limit": limit}
            config = {**config, "limit": limit}
        return items, summary, config

    if manifest_path is not None:
        items, summary = load_precomputed_manifest(manifest_path)
        if limit is not None:
            items = items[:limit]
            summary = {**summary, "limit": limit}
        return items, summary, {}

    # No precomputed source: scan the corpus and build the queue now.
    queue, index_summary = build_queue(
        ocr_root=ocr_root,
        raw_root=raw_root,
        queue_mode=queue_mode,
        sample_size=sample_size,
        seed=seed,
        limit=limit,
        limit_reports=limit_reports,
    )
    return [item.to_manifest_record() for item in queue], index_summary, {}
def create_app(default_session_id: str | None, build_defaults: dict[str, Any]) -> Flask:
    """Build the Flask application serving the annotation UI and its JSON API.

    Args:
        default_session_id: Session the root page redirects to when the URL
            has no ``?session=`` parameter; ``None`` shows the landing page.
        build_defaults: Fallback queue-building options used by
            ``POST /api/sessions`` for any field the request body omits.

    Returns:
        The configured ``Flask`` app with every route registered.
    """
    app = Flask(__name__, template_folder="templates", static_folder="static")
    app.config["DEFAULT_SESSION_ID"] = default_session_id
    app.config["BUILD_DEFAULTS"] = build_defaults

    @app.get("/")
    def index() -> Any:
        # If ?session= in URL, serve the annotation UI for that session
        session_from_url = request.args.get("session")
        if session_from_url:
            return render_template("index.html", session_id=session_from_url)
        # If server was started with a pre-created session, redirect to it
        if default_session_id:
            return redirect(f"/?session={default_session_id}")
        # Otherwise show the landing / session picker page
        return render_template("landing.html")

    @app.get("/api/sessions")
    def api_sessions() -> Any:
        return jsonify(
            {
                "sessions": list_sessions(),
                "default_session_id": default_session_id or None,
            }
        )

    @app.post("/api/sessions")
    def api_create_session() -> Any:
        payload = request.get_json(force=True, silent=True) or {}
        if not isinstance(payload, dict):
            abort(400, description="request body must be a JSON object")
        defaults = app.config["BUILD_DEFAULTS"]

        # Resolve every option once so the queue build and the stored session
        # config cannot disagree.
        # NOTE(review): ocr_root, raw_root, manifest_path and study_bundle_path
        # are accepted from the request body as server-side filesystem paths;
        # confirm that is acceptable for the deployment.
        queue_mode = payload.get("queue_mode") or defaults["queue_mode"]
        ocr_root = payload.get("ocr_root") or defaults["ocr_root"]
        raw_root = payload.get("raw_root") or defaults["raw_root"]
        study_bundle_value = payload.get("study_bundle_path") or defaults.get(
            "study_bundle_path"
        )
        study_bundle_path = Path(study_bundle_value) if study_bundle_value else None
        manifest_path_value = payload.get("manifest_path") or defaults.get(
            "manifest_path"
        )
        manifest_path = Path(manifest_path_value) if manifest_path_value else None
        sample_size = payload.get("sample_size", defaults.get("sample_size"))
        limit = payload.get("limit", defaults.get("limit"))
        limit_reports = payload.get("limit_reports", defaults.get("limit_reports"))
        try:
            seed = int(payload.get("seed", defaults["seed"]))
        except (TypeError, ValueError):
            abort(400, description="seed must be an integer")

        try:
            manifest_items, index_summary, study_config = resolve_session_source(
                study_bundle_path=study_bundle_path,
                manifest_path=manifest_path,
                ocr_root=Path(ocr_root),
                raw_root=Path(raw_root),
                queue_mode=queue_mode,
                sample_size=sample_size,
                seed=seed,
                limit=limit,
                limit_reports=limit_reports,
            )
        except ValueError as exc:
            # Invalid manifests/bundles (or an exhausted bundle) are request
            # problems, not server crashes.
            abort(400, description=str(exc))

        config = {
            "ocr_root": ocr_root,
            "raw_root": raw_root,
            "study_bundle_path": str(study_bundle_path.resolve())
            if study_bundle_path
            else None,
            "manifest_path": str(manifest_path) if manifest_path else None,
            "queue_mode": queue_mode,
            "sample_size": sample_size,
            "seed": seed,
            "limit": limit,
            "limit_reports": limit_reports,
            **study_config,
        }
        metadata = create_session(
            session_name=str(payload.get("session_name") or "OCR annotation session"),
            annotator=str(payload.get("annotator") or "anonymous"),
            manifest_items=manifest_items,
            index_summary=index_summary,
            config=config,
        )
        # Drop cached manifests so the new session is read fresh from disk.
        cached_manifest.cache_clear()
        return jsonify(
            {"metadata": metadata, "progress": progress_payload(metadata["session_id"])}
        )

    # The routes below declare the URL variables their handlers require; the
    # <int:...> and <path:...> converters match how the handlers use the values
    # (integer arithmetic on ``index``, slashes inside ``rel_path``).
    @app.get("/api/session/<session_id>")
    def api_session(session_id: str) -> Any:
        return jsonify(progress_payload(session_id))

    @app.get("/api/session/<session_id>/item/<int:index>")
    def api_item(session_id: str, index: int) -> Any:
        manifest = cached_manifest(session_id)
        item = get_item_or_404(session_id, index)
        text = item_page_text(item)
        annotations = load_current_annotations(session_id)
        show_inline_images = request.args.get("inline_images", "1") != "0"
        # Advertise the next item's image so the client can prefetch it.
        next_image_url = None
        if index + 1 < len(manifest) and manifest[index + 1].get("raw_png_path"):
            next_image_url = f"/api/session/{session_id}/item/{index + 1}/raw-image"
        return jsonify(
            {
                "index": index,
                "item_count": len(manifest),
                "item": item,
                "annotation": annotations.get(item["item_id"]),
                "page_text": text,
                "markdown_html": render_markdown_page(
                    text,
                    session_id=session_id,
                    index=index,
                    show_inline_images=show_inline_images,
                ),
                "inline_images": show_inline_images,
                "image_url": f"/api/session/{session_id}/item/{index}/raw-image",
                "next_image_url": next_image_url,
            }
        )

    @app.get("/api/session/<session_id>/item/<int:index>/raw-image")
    def api_raw_image(session_id: str, index: int) -> Any:
        item = get_item_or_404(session_id, index)
        raw_png_path = item.get("raw_png_path")
        if not raw_png_path:
            abort(404, description="raw page image missing")
        target = Path(raw_png_path).resolve()
        # NOTE(review): when the item carries no raw_root this falls back to
        # "/", which makes the containment check below accept any path.
        raw_root = Path(item.get("raw_root") or "/").resolve()
        if not target.is_relative_to(raw_root):
            abort(400, description="raw image outside raw root")
        if not target.is_file():
            abort(404, description="raw page image missing")
        return send_file(target, conditional=True, max_age=86400)

    @app.get("/api/session/<session_id>/item/<int:index>/inline-image/<path:rel_path>")
    def api_inline_image(session_id: str, index: int, rel_path: str) -> Any:
        item = get_item_or_404(session_id, index)
        report_dir = Path(item["report_dir"])
        # safe_child_path rejects absolute paths and anything escaping report_dir.
        target = safe_child_path(report_dir, rel_path)
        if not target.is_file():
            abort(404, description="inline OCR image missing")
        return send_file(target, conditional=True, max_age=86400)

    @app.post("/api/session/<session_id>/annotation")
    def api_save_annotation(session_id: str) -> Any:
        payload = request.get_json(force=True, silent=False) or {}
        if not isinstance(payload, dict):
            abort(400, description="request body must be a JSON object")
        item_id = payload.get("item_id")
        if not item_id:
            abort(400, description="missing item_id")
        record = save_annotation(
            session_id=session_id, item_id=str(item_id), payload=payload
        )
        return jsonify({"annotation": record, "progress": progress_payload(session_id)})

    @app.get("/api/session/<session_id>/progress")
    def api_progress(session_id: str) -> Any:
        return jsonify(progress_payload(session_id))

    @app.post("/api/session/<session_id>/summarize")
    def api_summarize(session_id: str) -> Any:
        paths = write_summary_files(session_id)
        return jsonify({"paths": paths, "progress": progress_payload(session_id)})

    @app.get("/api/session/<session_id>/summary.csv")
    def api_summary_csv(session_id: str) -> Any:
        # Regenerate before download so the file reflects current annotations.
        write_summary_files(session_id)
        return send_file(session_dir(session_id) / "summary.csv", as_attachment=True)

    @app.get("/api/session/<session_id>/summary.md")
    def api_summary_md(session_id: str) -> Any:
        write_summary_files(session_id)
        return send_file(session_dir(session_id) / "summary.md", as_attachment=True)

    return app
+ session_id: str | None = None + if args.session_id: + session_id = prepare_session(args) + elif args.annotator != "anonymous" or args.session_name != "OCR annotation session": + session_id = prepare_session(args) + + build_defaults = { + "ocr_root": str(args.ocr_root), + "raw_root": str(args.raw_root), + "study_bundle_path": str(args.study_bundle.resolve()) + if args.study_bundle + else None, + "manifest_path": str(args.manifest_path) if args.manifest_path else None, + "queue_mode": args.queue_mode, + "sample_size": args.sample_size, + "seed": args.seed, + "limit": args.limit, + "limit_reports": args.limit_reports, + } + app = create_app(session_id, build_defaults) + if session_id: + print(f"Annotation session: {session_id}") + else: + print( + "Starting in headless mode — users will create sessions from the browser." + ) + print(f"Open: http://{args.host}:{args.port}") + app.run(host=args.host, port=args.port, debug=args.debug) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/annotation_OCR/static/app.js b/annotation_OCR/static/app.js new file mode 100644 index 0000000..d9a18c0 --- /dev/null +++ b/annotation_OCR/static/app.js @@ -0,0 +1,528 @@ +const state = { + sessionId: window.OCR_ANNOTATION_SESSION_ID + || new URLSearchParams(window.location.search).get('session') + || window.OCR_ANNOTATION_DEFAULT_SESSION_ID, + index: 0, + itemCount: 0, + item: null, + overallStatus: 'unreviewed', + startedAt: null, + zoom: 1, + showingRaw: false, + showInlineImages: true, + saving: false, + prefetchImage: null, +}; + +const IMAGE_STAGE_PADDING = 16; +const DET_COORD_MAX = 999; +const FOCUS_VIEWPORT_MARGIN = 12; +const FOCUS_BOX_OVERSCAN_X = 1.06; +const FOCUS_BOX_OVERSCAN_Y = 1.08; + +const els = { + sessionTitle: document.getElementById('sessionTitle'), + sessionMeta: document.getElementById('sessionMeta'), + progressText: document.getElementById('progressText'), + progressBar: document.getElementById('progressBar'), + prevButton: 
document.getElementById('prevButton'), + nextButton: document.getElementById('nextButton'), + skipReviewedButton: document.getElementById('skipReviewedButton'), + helpButton: document.getElementById('helpButton'), + imageStage: document.getElementById('imageStage'), + imageCanvas: document.getElementById('imageCanvas'), + rawImage: document.getElementById('rawImage'), + imageOverlay: document.getElementById('imageOverlay'), + imageMissing: document.getElementById('imageMissing'), + imageSubtitle: document.getElementById('imageSubtitle'), + markdownSubtitle: document.getElementById('markdownSubtitle'), + markdownPreview: document.getElementById('markdownPreview'), + rawMarkdown: document.getElementById('rawMarkdown'), + inlineImagesToggle: document.getElementById('inlineImagesToggle'), + toggleRawButton: document.getElementById('toggleRawButton'), + zoomOutButton: document.getElementById('zoomOutButton'), + zoomResetButton: document.getElementById('zoomResetButton'), + zoomInButton: document.getElementById('zoomInButton'), + refocusButton: document.getElementById('refocusButton'), + reportName: document.getElementById('reportName'), + industryValue: document.getElementById('industryValue'), + tickerValue: document.getElementById('tickerValue'), + pageValue: document.getElementById('pageValue'), + signalsValue: document.getElementById('signalsValue'), + mappingValue: document.getElementById('mappingValue'), + notesInput: document.getElementById('notesInput'), + saveButton: document.getElementById('saveButton'), + saveStatus: document.getElementById('saveStatus'), + summaryCsvLink: document.getElementById('summaryCsvLink'), + summaryMdLink: document.getElementById('summaryMdLink'), + helpDialog: document.getElementById('helpDialog'), +}; + +function apiJson(url, options = {}) { + return fetch(url, { + headers: { 'Content-Type': 'application/json' }, + ...options, + }).then(async (response) => { + if (!response.ok) { + const text = await response.text(); + throw new 
Error(text || `${response.status} ${response.statusText}`); + } + return response.json(); + }); +} + +function statusMessage(message, tone = 'neutral') { + els.saveStatus.textContent = message; + els.saveStatus.dataset.tone = tone; +} + +function formatList(values) { + if (!values || values.length === 0) return 'none'; + return values.join(', '); +} + +function updateProgress(progress) { + const metadata = progress.metadata || {}; + state.itemCount = progress.item_count || 0; + els.sessionTitle.textContent = metadata.session_name || metadata.session_id || 'Session'; + els.sessionMeta.textContent = `${metadata.annotator || 'anonymous'} · ${metadata.session_id || state.sessionId}`; + const reviewed = progress.reviewed_count || 0; + const total = progress.item_count || 0; + els.progressText.textContent = `${reviewed} / ${total} reviewed`; + els.progressBar.style.width = `${total ? Math.round((reviewed / total) * 100) : 0}%`; + els.summaryCsvLink.href = `/api/session/${state.sessionId}/summary.csv`; + els.summaryMdLink.href = `/api/session/${state.sessionId}/summary.md`; +} + +function setOverall(status) { + state.overallStatus = status; + document.querySelectorAll('.status-button').forEach((button) => { + button.classList.toggle('active', button.dataset.status === status); + }); +} + +function loadAnnotation(annotation) { + setOverall(annotation?.overall_status || 'unreviewed'); + els.notesInput.value = annotation?.notes || ''; +} + +function fittedImageWidth() { + const stage = els.imageCanvas.parentElement; + const availableWidth = Math.max(240, stage.clientWidth - 32); + const availableHeight = Math.max(240, stage.clientHeight - 32); + const naturalWidth = els.rawImage.naturalWidth || availableWidth; + const naturalHeight = els.rawImage.naturalHeight || naturalWidth * 1.414; + const fitScale = Math.min(availableWidth / naturalWidth, availableHeight / naturalHeight); + return Math.max(120, Math.floor(naturalWidth * fitScale)); +} + +function applyZoom() { + const 
/**
 * Report whether an item is a table with a usable focus bounding box.
 *
 * @param {object} [item] - Item to inspect; defaults to the current item.
 * @returns {boolean} True only when `item_kind` is 'table' and `focus_bbox`
 *   is an array of exactly four finite numbers.
 */
function hasTableFocus(item = state.item) {
  if (!item || item.item_kind !== 'table') {
    return false;
  }
  const bbox = item.focus_bbox;
  if (!Array.isArray(bbox) || bbox.length !== 4) {
    return false;
  }
  for (const coordinate of bbox) {
    if (!Number.isFinite(coordinate)) {
      return false;
    }
  }
  return true;
}
/**
 * Collect the bounding boxes to highlight for the current item.
 *
 * Prefers the item's `focus_bboxes` list and falls back to the single
 * `focus_bbox`. Boxes that are not arrays of four finite numbers are dropped,
 * so malformed entries cannot produce NaN overlay geometry.
 *
 * @returns {number[][]} Valid boxes; empty when the item has none.
 */
function tableBoxes() {
  const isValidBox = (box) => Array.isArray(box)
    && box.length === 4
    && box.every((value) => Number.isFinite(value));

  const item = state.item;
  if (Array.isArray(item?.focus_bboxes)) {
    const validBoxes = item.focus_bboxes.filter(isValidBox);
    if (validBoxes.length > 0) {
      return validBoxes;
    }
  }
  if (isValidBox(item?.focus_bbox)) {
    return [item.focus_bbox];
  }
  return [];
}
/**
 * Restrict a number to the inclusive range [min, max].
 *
 * @param {number} value - Number to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} `value` raised to `min` first, then capped at `max`.
 */
function clamp(value, min, max) {
  const raisedToFloor = Math.max(min, value);
  const cappedAtCeiling = Math.min(max, raisedToFloor);
  return cappedAtCeiling;
}
/**
 * Fetch one queue item and render it into the page.
 *
 * The index is clamped into the known item range. If another load starts
 * while this one is waiting on the server, this (older) response is discarded
 * so it cannot overwrite the newer item. Optional item fields
 * (`page_text_sha256`, `page_text_chars`, `mapping_warnings`) are tolerated
 * when missing.
 *
 * @param {number} index - Zero-based item index to load.
 * @returns {Promise<void>}
 */
async function loadItem(index) {
  const safeIndex = Math.max(0, Math.min(index, Math.max(0, state.itemCount - 1)));
  const inlineFlag = state.showInlineImages ? '1' : '0';

  // Tag this request so a slower, older response can be recognised and dropped.
  const requestToken = (state.loadToken || 0) + 1;
  state.loadToken = requestToken;

  const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}?inline_images=${inlineFlag}`);
  if (state.loadToken !== requestToken) return;

  const item = data.item;
  const contentHash = item.page_text_sha256 || '';
  const warnings = Array.isArray(item.mapping_warnings) ? item.mapping_warnings : [];

  state.index = safeIndex;
  state.item = item;
  state.itemCount = data.item_count;
  state.startedAt = new Date();

  els.reportName.textContent = item.report_name;
  els.industryValue.textContent = item.industry_slug;
  els.tickerValue.textContent = `${item.exchange}:${item.ticker} · ${item.year}`;
  els.pageValue.textContent = item.item_kind === 'table'
    ? `${item.page_number} / ${item.mmd_page_count} · Table ${(item.table_index ?? 0) + 1}`
    : `${item.page_number} / ${item.mmd_page_count}`;
  els.signalsValue.textContent = formatList(item.candidate_reasons);
  els.mappingValue.textContent = [item.mapping_status, ...warnings].filter(Boolean).join(' · ');
  els.imageSubtitle.textContent = item.raw_png_path || 'No raw image path';
  els.markdownSubtitle.textContent = `${item.page_text_chars ?? 0} chars · ${contentHash.slice(0, 12)}`;

  els.markdownPreview.innerHTML = data.markdown_html || '';
  els.rawMarkdown.textContent = data.page_text || '';
  resetExtractedContentScroll();
  clearFocusOverlay();

  if (item.raw_png_path) {
    els.rawImage.hidden = false;
    els.imageMissing.hidden = true;
    // The ?v= suffix changes the URL whenever the item's content hash changes.
    // NOTE(review): next_image_url is prefetched without this suffix, so the
    // prefetched response may not be reused for the next item — confirm.
    els.rawImage.src = `${data.image_url}?v=${encodeURIComponent(contentHash)}`;
    prefetchNextImage(data.next_image_url);
  } else {
    els.rawImage.hidden = true;
    els.imageMissing.hidden = false;
    els.rawImage.removeAttribute('src');
    setZoom(1);
    clearFocusOverlay();
  }

  loadAnnotation(data.annotation);
  statusMessage(`Loaded item ${safeIndex + 1} of ${data.item_count}`);
  els.prevButton.disabled = safeIndex === 0;
  els.nextButton.disabled = safeIndex >= data.item_count - 1;
}
state.startedAt.toISOString() : null, + client_updated_at_utc: new Date().toISOString(), + }; +} + +async function saveAnnotation(source = 'manual', advance = false) { + if (!state.item || state.saving) return; + state.saving = true; + els.saveButton.disabled = true; + statusMessage('Saving...'); + try { + const data = await apiJson(`/api/session/${state.sessionId}/annotation`, { + method: 'POST', + body: JSON.stringify(annotationPayload(source)), + }); + updateProgress(data.progress); + statusMessage('Saved', 'ok'); + if (advance && state.index < state.itemCount - 1) { + await loadItem(state.index + 1); + await loadProgress(); + } + } catch (error) { + statusMessage(`Save failed: ${error.message}`, 'error'); + } finally { + state.saving = false; + els.saveButton.disabled = false; + } +} + +function quickMark(status, source = 'shortcut') { + setOverall(status); + saveAnnotation(`${source}:${status}`, true); +} + +async function go(delta) { + const target = state.index + delta; + if (target < 0 || target >= state.itemCount) return; + await loadItem(target); + await loadProgress(); +} + +async function goNextOpen() { + const progress = await loadProgress(); + if (progress.next_unreviewed_index === null || progress.next_unreviewed_index === undefined) { + statusMessage('No open items'); + return; + } + await loadItem(progress.next_unreviewed_index); +} + +function toggleRawMarkdown() { + state.showingRaw = !state.showingRaw; + els.rawMarkdown.hidden = !state.showingRaw; + els.markdownPreview.hidden = state.showingRaw; + els.toggleRawButton.textContent = state.showingRaw ? 
'Rendered' : 'Raw Markdown'; +} + +function inputHasFocus() { + const active = document.activeElement; + return active && ['TEXTAREA', 'INPUT', 'SELECT'].includes(active.tagName); +} + +function setupEvents() { + els.prevButton.addEventListener('click', () => go(-1)); + els.nextButton.addEventListener('click', () => go(1)); + els.skipReviewedButton.addEventListener('click', goNextOpen); + els.saveButton.addEventListener('click', () => saveAnnotation('manual', false)); + els.inlineImagesToggle.addEventListener('change', () => { + state.showInlineImages = els.inlineImagesToggle.checked; + loadItem(state.index); + }); + els.toggleRawButton.addEventListener('click', toggleRawMarkdown); + els.zoomOutButton.addEventListener('click', () => adjustZoom(-0.15)); + els.zoomInButton.addEventListener('click', () => adjustZoom(0.15)); + els.zoomResetButton.addEventListener('click', () => focusCurrentItem({ resetZoom: true })); + els.refocusButton.addEventListener('click', () => focusCurrentItem({ resetZoom: true })); + els.helpButton.addEventListener('click', () => els.helpDialog.showModal()); + els.rawImage.addEventListener('load', () => focusCurrentItem({ resetZoom: true })); + window.addEventListener('resize', () => { + const anchorPoint = tableFocusPoint() || viewportCenterPoint(); + applyZoom(); + scheduleAfterLayout(() => centerViewportOnPoint(anchorPoint)); + }); + document.querySelectorAll('.status-button').forEach((button) => { + button.addEventListener('click', () => quickMark(button.dataset.status, 'button')); + }); + + document.addEventListener('keydown', (event) => { + if (inputHasFocus()) return; + if (event.key === '?') { + event.preventDefault(); + els.helpDialog.showModal(); + } else if (event.key.toLowerCase() === 'a') { + event.preventDefault(); + quickMark('ok'); + } else if (event.key.toLowerCase() === 'r') { + event.preventDefault(); + quickMark('not_ok'); + } else if (event.key.toLowerCase() === 'u') { + event.preventDefault(); + quickMark('uncertain'); + 
} else if (event.key === 'ArrowRight' || event.key.toLowerCase() === 'j') { + event.preventDefault(); + go(1); + } else if (event.key === 'ArrowLeft' || event.key.toLowerCase() === 'k') { + event.preventDefault(); + go(-1); + } else if (event.key === '+' || event.key === '=') { + event.preventDefault(); + adjustZoom(0.15); + } else if (event.key === '-') { + event.preventDefault(); + adjustZoom(-0.15); + } else if (event.key === '0') { + event.preventDefault(); + focusCurrentItem({ resetZoom: true }); + } else if (event.key.toLowerCase() === 'f') { + event.preventDefault(); + focusCurrentItem({ resetZoom: true }); + } + }); +} + +async function init() { + setupEvents(); + try { + const progress = await loadProgress(); + const startIndex = progress.next_unreviewed_index ?? 0; + if (progress.item_count > 0) { + await loadItem(startIndex); + } else { + statusMessage('Session has no queued items', 'error'); + } + } catch (error) { + statusMessage(`Startup failed: ${error.message}`, 'error'); + } +} + +init(); \ No newline at end of file diff --git a/annotation_OCR/static/style.css b/annotation_OCR/static/style.css new file mode 100644 index 0000000..24f39fe --- /dev/null +++ b/annotation_OCR/static/style.css @@ -0,0 +1,522 @@ +:root { + --bg: #edf1f2; + --panel: #fbfcfa; + --panel-2: #f5f7f4; + --ink: #1d2528; + --muted: #5b686d; + --line: #cdd7d8; + --teal: #08746f; + --teal-dark: #075854; + --red: #aa3d2d; + --amber: #a06010; + --green: #2d7434; + --shadow: 0 18px 45px rgba(31, 45, 49, 0.14); + --mono: "JetBrains Mono", "IBM Plex Mono", "Cascadia Mono", monospace; + --sans: "Aptos", "Source Sans 3", "Segoe UI", sans-serif; +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + min-height: 100vh; + background: + linear-gradient(135deg, rgba(8, 116, 111, 0.09), transparent 34%), + linear-gradient(315deg, rgba(170, 61, 45, 0.08), transparent 36%), + var(--bg); + color: var(--ink); + font-family: var(--sans); +} + +button, +select, +textarea { + font: inherit; 
+} + +button, +.secondary-link { + border: 1px solid var(--line); + background: var(--panel); + color: var(--ink); + min-height: 36px; + padding: 0 12px; + border-radius: 6px; + cursor: pointer; + text-decoration: none; + display: inline-flex; + align-items: center; + justify-content: center; + white-space: nowrap; +} + +button:hover, +.secondary-link:hover { + border-color: var(--teal); +} + +.topbar { + position: sticky; + top: 0; + z-index: 20; + display: grid; + grid-template-columns: minmax(280px, 1fr) minmax(260px, 420px) auto; + gap: 18px; + align-items: center; + padding: 14px 18px; + background: rgba(251, 252, 250, 0.94); + border-bottom: 1px solid var(--line); + backdrop-filter: blur(14px); +} + +.eyebrow, +.section-label { + color: var(--muted); + font-size: 11px; + font-weight: 700; + letter-spacing: 0; + text-transform: uppercase; +} + +.session-title { + font-size: 18px; + font-weight: 800; +} + +.session-meta, +.pane-subtitle, +.save-status { + color: var(--muted); + font-size: 12px; +} + +.pane-subtitle { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.progress-block { + display: grid; + gap: 7px; +} + +.progress-track { + width: 100%; + height: 8px; + overflow: hidden; + background: #dce3e4; + border-radius: 999px; +} + +.progress-track div { + width: 0%; + height: 100%; + background: linear-gradient(90deg, var(--teal), #6a8d28); + transition: width 160ms ease; +} + +.nav-actions, +.zoom-actions, +.panel-actions { + display: flex; + gap: 8px; + align-items: center; + flex: 0 0 auto; +} + +.icon-button { + width: 36px; + padding: 0; + font-weight: 800; +} + +.workspace { + display: grid; + grid-template-columns: minmax(280px, 1.05fr) minmax(280px, 1fr) minmax(280px, 340px); + gap: 14px; + padding: 14px; + height: calc(100vh - 82px); +} + +.pane, +.annotation-panel { + min-width: 0; + min-height: 0; + background: var(--panel); + border: 1px solid var(--line); + border-radius: 8px; + box-shadow: var(--shadow); +} + +.pane 
{ + display: grid; + grid-template-rows: auto minmax(0, 1fr); + overflow: hidden; +} + +.pane-toolbar { + display: flex; + justify-content: space-between; + gap: 12px; + align-items: center; + min-width: 0; + overflow: hidden; + padding: 12px; + border-bottom: 1px solid var(--line); + background: var(--panel-2); +} + +.pane-toolbar>div:first-child { + flex: 1 1 auto; + min-width: 0; +} + +.zoom-actions { + margin-left: auto; +} + +.preview-actions { + display: flex; + flex: 0 0 auto; + align-items: center; + gap: 10px; +} + +.toggle-control { + display: inline-flex; + align-items: center; + gap: 6px; + color: var(--muted); + font-size: 12px; + white-space: nowrap; +} + +.zoom-actions button { + width: 36px; + padding: 0; +} + +.zoom-actions #zoomResetButton { + width: 58px; +} + +.pane-title { + font-size: 15px; + font-weight: 800; +} + +.image-stage { + position: relative; + overflow: auto; + display: block; + background: + linear-gradient(45deg, #dce3e4 25%, transparent 25%), + linear-gradient(-45deg, #dce3e4 25%, transparent 25%), + linear-gradient(45deg, transparent 75%, #dce3e4 75%), + linear-gradient(-45deg, transparent 75%, #dce3e4 75%); + background-size: 22px 22px; + background-position: 0 0, 0 11px, 11px -11px, -11px 0; +} + +.image-canvas { + --image-width: 320px; + --image-height: 453px; + --image-left: 16px; + --image-top: 16px; + --canvas-width: 100%; + --canvas-height: 100%; + position: relative; + width: var(--canvas-width); + min-width: var(--canvas-width); + height: var(--canvas-height); + min-height: var(--canvas-height); +} + +.image-overlay { + position: absolute; + inset: 0; + pointer-events: none; +} + +.focus-box { + position: absolute; + border: 1px solid rgba(204, 20, 20, 0.95); + border-radius: 4px; + background: rgba(204, 20, 20, 0.06); + box-shadow: + 0 0 0 1px rgba(255, 255, 255, 0.8) inset, + 0 0 0 1px rgba(204, 20, 20, 0.35); +} + +#rawImage { + position: absolute; + left: var(--image-left); + top: var(--image-top); + width: 
var(--image-width); + height: var(--image-height); + max-width: none; + border: 1px solid #b7c3c5; + background: white; + box-shadow: 0 12px 28px rgba(31, 45, 49, 0.18); +} + +.missing-state { + margin: 40px auto; + padding: 20px; + border: 1px dashed var(--red); + background: #fff7f3; + color: var(--red); + border-radius: 8px; +} + +.markdown-preview, +.raw-markdown { + overflow: auto; + margin: 0; + padding: 18px; +} + +.markdown-preview { + line-height: 1.48; +} + +.markdown-preview h1, +.markdown-preview h2, +.markdown-preview h3 { + margin: 1.2em 0 0.45em; + line-height: 1.15; +} + +.markdown-preview table { + width: max-content; + max-width: 100%; + border-collapse: collapse; + margin: 14px 0; + font-size: 13px; +} + +.markdown-preview th, +.markdown-preview td { + border: 1px solid #b9c4c6; + padding: 6px 8px; + vertical-align: top; +} + +.markdown-preview th { + background: #e3eceb; +} + +.markdown-preview img { + max-width: 100%; + height: auto; + border: 1px solid var(--line); +} + +.raw-markdown { + font-family: var(--mono); + font-size: 12px; + line-height: 1.45; + white-space: pre-wrap; + background: #172225; + color: #e7eeed; +} + +.annotation-panel { + display: flex; + flex-direction: column; + overflow: auto; + padding: 12px; + gap: 12px; +} + +.panel-section { + display: grid; + gap: 10px; + padding-bottom: 12px; + border-bottom: 1px solid var(--line); +} + +.report-card h1 { + margin: 0; + font-size: 18px; + line-height: 1.2; +} + +dl { + display: grid; + gap: 7px; + margin: 0; +} + +dl div { + display: grid; + grid-template-columns: 78px minmax(0, 1fr); + gap: 8px; +} + +dt { + color: var(--muted); + font-size: 12px; +} + +dd { + margin: 0; + min-width: 0; + overflow-wrap: anywhere; + font-size: 12px; +} + +.decision-buttons { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 8px; +} + +.status-button[data-status="ok"].active { + background: var(--green); + border-color: var(--green); + color: white; +} + 
+.status-button[data-status="not_ok"].active { + background: var(--red); + border-color: var(--red); + color: white; +} + +.status-button[data-status="uncertain"].active { + background: var(--amber); + border-color: var(--amber); + color: white; +} + +.subchecks-section label { + display: grid; + grid-template-columns: 1fr 140px; + gap: 8px; + align-items: center; + font-size: 13px; +} + +select, +textarea { + width: 100%; + border: 1px solid var(--line); + border-radius: 6px; + background: white; + color: var(--ink); +} + +select { + min-height: 34px; +} + +textarea { + resize: vertical; + padding: 8px; +} + +.issue-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 8px; +} + +.issue-grid label { + display: flex; + align-items: center; + gap: 6px; + font-size: 12px; +} + +.primary-button { + background: var(--teal); + border-color: var(--teal); + color: white; + font-weight: 800; + flex: 1; +} + +.primary-button:hover { + background: var(--teal-dark); + border-color: var(--teal-dark); +} + +.save-status { + min-height: 20px; +} + +.help-dialog { + width: min(520px, calc(100vw - 32px)); + border: 1px solid var(--line); + border-radius: 8px; + box-shadow: var(--shadow); +} + +.dialog-header { + display: flex; + align-items: center; + justify-content: space-between; + gap: 12px; +} + +.dialog-header h2 { + margin: 0 0 12px; +} + +.shortcut-grid { + display: grid; + grid-template-columns: 90px minmax(0, 1fr); + gap: 8px 14px; +} + +.shortcut-grid span { + font-family: var(--mono); + font-weight: 800; +} + +.shortcut-grid p { + margin: 0; +} + +@media (max-width: 1180px) { + .topbar { + grid-template-columns: 1fr; + } + + .workspace { + height: auto; + min-height: calc(100vh - 82px); + grid-template-columns: 1fr; + } + + .pane { + min-height: 72vh; + } + + .annotation-panel { + min-height: 0; + } +} + +@media (max-width: 620px) { + + .nav-actions, + .pane-toolbar, + .panel-actions { + flex-wrap: wrap; + } + + .decision-buttons, + .issue-grid { + 
grid-template-columns: 1fr; + } + + .subchecks-section label, + dl div { + grid-template-columns: 1fr; + } +} \ No newline at end of file diff --git a/annotation_OCR/store.py b/annotation_OCR/store.py new file mode 100644 index 0000000..0d85a6b --- /dev/null +++ b/annotation_OCR/store.py @@ -0,0 +1,369 @@ +"""File-backed session storage for OCR annotation runs.""" + +from __future__ import annotations + +import csv +import json +import re +import uuid +from collections import Counter +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +HERE = Path(__file__).resolve().parent +SESSIONS_DIR = HERE / "sessions" +SCHEMA_VERSION = "1.0" + +VALID_OVERALL_STATUS = {"ok", "not_ok", "uncertain", "unreviewed"} + +SUMMARY_FIELDS = [ + "session_id", + "session_name", + "annotator", + "item_id", + "item_kind", + "industry_slug", + "report_name", + "exchange", + "ticker", + "year", + "page_index", + "page_number", + "table_index", + "table_row_count", + "table_col_count", + "overall_status", + "notes", + "updated_at_utc", + "annotation_source", + "review_duration_ms", + "mapping_status", + "mapping_warnings", + "candidate_reasons", + "page_text_sha256", + "raw_png_path", + "mmd_path", + "det_mmd_path", + "focus_bbox", +] + + +def utc_now() -> str: + return datetime.now(timezone.utc).isoformat(timespec="seconds") + + +def session_slug(value: str) -> str: + slug = re.sub(r"[^A-Za-z0-9_.-]+", "-", value.strip()).strip("-._") + return slug[:48] or "session" + + +def new_session_id(session_name: str | None = None) -> str: + prefix = session_slug(session_name or "session")[:24] + return f"{prefix}-{uuid.uuid4().hex[:12]}" + + +def atomic_write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(text, encoding="utf-8") + tmp.replace(path) + + +def atomic_write_json(path: Path, payload: Any) -> None: + atomic_write_text(path, json.dumps(payload, 
indent=2, ensure_ascii=False)) + + +def session_dir(session_id: str) -> Path: + return SESSIONS_DIR / session_id + + +def metadata_path(session_id: str) -> Path: + return session_dir(session_id) / "metadata.json" + + +def manifest_path(session_id: str) -> Path: + return session_dir(session_id) / "manifest.json" + + +def current_annotations_path(session_id: str) -> Path: + return session_dir(session_id) / "current_annotations.json" + + +def annotations_log_path(session_id: str) -> Path: + return session_dir(session_id) / "annotations.jsonl" + + +def create_session( + *, + session_name: str, + annotator: str, + manifest_items: list[dict[str, Any]], + index_summary: dict[str, Any], + config: dict[str, Any], + session_id: str | None = None, +) -> dict[str, Any]: + sid = session_id or new_session_id(session_name) + directory = session_dir(sid) + if directory.exists(): + raise FileExistsError(f"session already exists: {sid}") + directory.mkdir(parents=True, exist_ok=False) + + now = utc_now() + metadata = { + "schema_version": SCHEMA_VERSION, + "session_id": sid, + "session_name": session_name, + "annotator": annotator, + "created_at_utc": now, + "updated_at_utc": now, + "status": "active", + "item_count": len(manifest_items), + "completed_count": 0, + "index_summary": index_summary, + "config": config, + } + manifest = { + "schema_version": SCHEMA_VERSION, + "session_id": sid, + "created_at_utc": now, + "item_count": len(manifest_items), + "items": manifest_items, + } + + atomic_write_json(metadata_path(sid), metadata) + atomic_write_json(manifest_path(sid), manifest) + atomic_write_json(current_annotations_path(sid), {}) + annotations_log_path(sid).touch() + write_summary_files(sid) + return metadata + + +def load_json(path: Path, default: Any | None = None) -> Any: + if not path.is_file(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def load_metadata(session_id: str) -> dict[str, Any]: + metadata = load_json(metadata_path(session_id)) + 
if metadata is None: + raise FileNotFoundError(f"unknown session: {session_id}") + return metadata + + +def load_manifest(session_id: str) -> list[dict[str, Any]]: + manifest = load_json(manifest_path(session_id)) + if manifest is None: + raise FileNotFoundError(f"unknown session manifest: {session_id}") + return manifest.get("items", []) + + +def load_current_annotations(session_id: str) -> dict[str, dict[str, Any]]: + return load_json(current_annotations_path(session_id), default={}) or {} + + +def list_sessions() -> list[dict[str, Any]]: + if not SESSIONS_DIR.is_dir(): + return [] + sessions: list[dict[str, Any]] = [] + for path in sorted(SESSIONS_DIR.iterdir()): + if not path.is_dir(): + continue + metadata = load_json(path / "metadata.json") + if isinstance(metadata, dict): + sessions.append(metadata) + sessions.sort(key=lambda rec: rec.get("updated_at_utc", ""), reverse=True) + return sessions + + +def manifest_index(session_id: str) -> dict[str, dict[str, Any]]: + return {item["item_id"]: item for item in load_manifest(session_id)} + + +def sanitize_status(value: Any, valid: set[str], default: str) -> str: + if isinstance(value, str) and value in valid: + return value + return default + + +def normalize_annotation_payload(payload: dict[str, Any]) -> dict[str, Any]: + return { + "overall_status": sanitize_status( + payload.get("overall_status"), VALID_OVERALL_STATUS, "unreviewed" + ), + "notes": str(payload.get("notes") or "").strip(), + "annotation_source": str(payload.get("annotation_source") or "manual"), + "review_duration_ms": payload.get("review_duration_ms"), + "client_started_at_utc": payload.get("client_started_at_utc"), + "client_updated_at_utc": payload.get("client_updated_at_utc"), + } + + +def next_log_sequence(path: Path) -> int: + if not path.is_file(): + return 1 + with path.open(encoding="utf-8") as handle: + return sum(1 for line in handle if line.strip()) + 1 + + +def save_annotation( + *, + session_id: str, + item_id: str, + payload: 
dict[str, Any], +) -> dict[str, Any]: + metadata = load_metadata(session_id) + items = manifest_index(session_id) + item = items.get(item_id) + if item is None: + raise KeyError(f"item not in session manifest: {item_id}") + + normalized = normalize_annotation_payload(payload) + now = utc_now() + log_path = annotations_log_path(session_id) + record = { + "schema_version": SCHEMA_VERSION, + "sequence": next_log_sequence(log_path), + "session_id": session_id, + "session_name": metadata.get("session_name"), + "annotator": metadata.get("annotator"), + "created_at_utc": now, + "updated_at_utc": now, + "item_id": item_id, + "item_kind": item.get("item_kind", "page"), + "industry_slug": item.get("industry_slug"), + "report_name": item.get("report_name"), + "exchange": item.get("exchange"), + "ticker": item.get("ticker"), + "year": item.get("year"), + "page_index": item.get("page_index"), + "page_number": item.get("page_number"), + "table_index": item.get("table_index"), + "table_row_count": item.get("table_row_count"), + "table_col_count": item.get("table_col_count"), + "mmd_path": item.get("mmd_path"), + "det_mmd_path": item.get("det_mmd_path"), + "raw_png_path": item.get("raw_png_path"), + "focus_bbox": item.get("focus_bbox"), + "mapping_status": item.get("mapping_status"), + "mapping_warnings": item.get("mapping_warnings", []), + "candidate_reasons": item.get("candidate_reasons", []), + "page_text_sha256": item.get("page_text_sha256"), + **normalized, + } + + with log_path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(record, ensure_ascii=False) + "\n") + + current = load_current_annotations(session_id) + current[item_id] = record + atomic_write_json(current_annotations_path(session_id), current) + + completed_count = sum( + 1 for rec in current.values() if rec.get("overall_status") != "unreviewed" + ) + metadata["updated_at_utc"] = now + metadata["completed_count"] = completed_count + metadata["item_count"] = len(items) + 
atomic_write_json(metadata_path(session_id), metadata) + write_summary_files(session_id) + return record + + +def summary_rows(session_id: str) -> list[dict[str, Any]]: + metadata = load_metadata(session_id) + current = load_current_annotations(session_id) + rows: list[dict[str, Any]] = [] + for item in load_manifest(session_id): + annotation = current.get(item["item_id"], {}) + rows.append( + { + "session_id": session_id, + "session_name": metadata.get("session_name", ""), + "annotator": metadata.get("annotator", ""), + "item_id": item.get("item_id"), + "item_kind": item.get("item_kind", "page"), + "industry_slug": item.get("industry_slug"), + "report_name": item.get("report_name"), + "exchange": item.get("exchange"), + "ticker": item.get("ticker"), + "year": item.get("year"), + "page_index": item.get("page_index"), + "page_number": item.get("page_number"), + "table_index": item.get("table_index"), + "table_row_count": item.get("table_row_count"), + "table_col_count": item.get("table_col_count"), + "overall_status": annotation.get("overall_status", "unreviewed"), + "notes": annotation.get("notes", ""), + "updated_at_utc": annotation.get("updated_at_utc", ""), + "annotation_source": annotation.get("annotation_source", ""), + "review_duration_ms": annotation.get("review_duration_ms", ""), + "mapping_status": item.get("mapping_status"), + "mapping_warnings": ";".join(item.get("mapping_warnings", [])), + "candidate_reasons": ";".join(item.get("candidate_reasons", [])), + "page_text_sha256": item.get("page_text_sha256"), + "raw_png_path": item.get("raw_png_path"), + "mmd_path": item.get("mmd_path"), + "det_mmd_path": item.get("det_mmd_path"), + "focus_bbox": json.dumps(item.get("focus_bbox")), + } + ) + return rows + + +def write_summary_csv(path: Path, rows: list[dict[str, Any]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + with tmp.open("w", newline="", encoding="utf-8") as handle: + writer = 
csv.DictWriter( + handle, fieldnames=SUMMARY_FIELDS, extrasaction="ignore" + ) + writer.writeheader() + writer.writerows(rows) + tmp.replace(path) + + +def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None: + metadata = load_metadata(path.parent.name) + status_counts = Counter(row["overall_status"] for row in rows) + + reviewed = len(rows) - status_counts.get("unreviewed", 0) + lines = [ + f"# OCR Annotation Summary: {metadata.get('session_name', path.parent.name)}", + "", + f"- Session ID: `{path.parent.name}`", + f"- Annotator: `{metadata.get('annotator', '')}`", + f"- Items: {len(rows)}", + f"- Reviewed: {reviewed}", + f"- Updated: {metadata.get('updated_at_utc', '')}", + "", + "## Status Counts", + "", + "| Status | Count |", + "| --- | ---: |", + ] + for status, count in sorted(status_counts.items()): + lines.append(f"| {status} | {count} |") + + atomic_write_text(path, "\n".join(lines) + "\n") + + +def write_summary_files(session_id: str) -> dict[str, str]: + rows = summary_rows(session_id) + directory = session_dir(session_id) + csv_path = directory / "summary.csv" + md_path = directory / "summary.md" + write_summary_csv(csv_path, rows) + write_summary_md(md_path, rows) + return {"summary_csv": str(csv_path), "summary_md": str(md_path)} + + +def write_all_sessions_summary(path: Path | None = None) -> Path: + out_path = path or (SESSIONS_DIR / "all_sessions_summary.csv") + rows: list[dict[str, Any]] = [] + for metadata in list_sessions(): + rows.extend(summary_rows(metadata["session_id"])) + write_summary_csv(out_path, rows) + return out_path diff --git a/annotation_OCR/study_agreement.py b/annotation_OCR/study_agreement.py new file mode 100644 index 0000000..0e73762 --- /dev/null +++ b/annotation_OCR/study_agreement.py @@ -0,0 +1,787 @@ +"""Compute agreement and accept/reject ratios for bundle-backed table studies.""" + +from __future__ import annotations + +import argparse +import csv +import json +from collections import Counter +from 
dataclasses import dataclass +from datetime import datetime, timezone +from math import comb +from pathlib import Path +from typing import Any + + +HERE = Path(__file__).resolve().parent +DEFAULT_SESSIONS_DIR = HERE / "sessions" +DEFAULT_ANALYSIS_ROOT = DEFAULT_SESSIONS_DIR / "study_analysis" +REVIEWED_STATUSES = ("ok", "not_ok", "uncertain") +VALID_STATUSES = set(REVIEWED_STATUSES).union({"unreviewed"}) + + +@dataclass(slots=True) +class SessionPayload: + session_id: str + session_name: str + annotator: str + slot: int + item_count: int + completed_count: int + updated_at_utc: str + metadata: dict[str, Any] + manifest_items: list[dict[str, Any]] + current_annotations: dict[str, dict[str, Any]] + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Compute agreement metrics and accept/reject ratios for OCR table study sessions." + ) + parser.add_argument( + "--study-bundle", + type=Path, + required=True, + help="Path to the study_sessions_*.json bundle used for the annotation round.", + ) + parser.add_argument( + "--sessions-dir", + type=Path, + default=DEFAULT_SESSIONS_DIR, + help="Directory containing annotation_OCR session folders.", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=None, + help="Output directory for summary artifacts. Defaults to sessions/study_analysis//.", + ) + parser.add_argument( + "--session-id", + dest="session_ids", + nargs="+", + default=None, + help="Optional explicit session ids to analyze. 
If omitted, all sessions linked to the study bundle are used.", + ) + parser.add_argument( + "--strict-manifest", + action="store_true", + help="Fail if a selected session manifest does not match its bundle slot.", + ) + return parser + + +def utc_now() -> str: + return datetime.now(timezone.utc).isoformat(timespec="seconds") + + +def load_json(path: Path, *, default: Any | None = None) -> Any: + if not path.is_file(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def atomic_write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(text, encoding="utf-8") + tmp.replace(path) + + +def atomic_write_json(path: Path, payload: Any) -> None: + atomic_write_text(path, json.dumps(payload, indent=2, ensure_ascii=False) + "\n") + + +def safe_div(numerator: int, denominator: int) -> float | None: + if denominator == 0: + return None + return numerator / denominator + + +def format_ratio(value: float | None) -> str: + if value is None: + return "n/a" + return f"{value * 100:.1f}%" + + +def parse_int(value: Any) -> int | None: + if isinstance(value, int): + return value + if isinstance(value, str) and value.isdigit(): + return int(value) + return None + + +def normalize_status(value: Any) -> str: + if isinstance(value, str) and value in VALID_STATUSES: + return value + return "unreviewed" + + +def load_study_bundle(path: Path) -> dict[str, Any]: + payload = load_json(path) + sessions = payload.get("sessions") if isinstance(payload, dict) else None + if payload.get("bundle_type") != "ocr_table_study_bundle" or not isinstance( + sessions, list + ): + raise ValueError(f"invalid study bundle in {path}") + return payload + + +def build_bundle_index( + bundle: dict[str, Any], +) -> tuple[dict[int, dict[str, Any]], dict[str, dict[str, Any]], list[str]]: + slot_index: dict[int, dict[str, Any]] = {} + item_index: dict[str, dict[str, Any]] = {} + warnings: 
def load_session_payload(sessions_dir: Path, session_id: str) -> SessionPayload:
    """Read one session directory into a ``SessionPayload``.

    Reads ``metadata.json``, ``manifest.json`` and ``current_annotations.json``
    from ``sessions_dir / session_id``.

    Raises:
        FileNotFoundError: ``metadata.json`` did not decode to a dict.
        ValueError: the manifest has no ``items`` list, the annotations file
            is not a dict, or the metadata config has no parseable
            ``study_slot``.
    """
    session_dir = sessions_dir / session_id

    metadata = load_json(session_dir / "metadata.json")
    if not isinstance(metadata, dict):
        raise FileNotFoundError(f"missing metadata for session {session_id}")

    manifest = load_json(session_dir / "manifest.json", default={}) or {}
    items = manifest.get("items") if isinstance(manifest, dict) else None
    if not isinstance(items, list):
        raise ValueError(f"invalid manifest for session {session_id}")

    annotations = (
        load_json(session_dir / "current_annotations.json", default={}) or {}
    )
    if not isinstance(annotations, dict):
        raise ValueError(f"invalid current_annotations for session {session_id}")

    study_slot = parse_int((metadata.get("config") or {}).get("study_slot"))
    if study_slot is None:
        raise ValueError(f"session {session_id} has no usable study_slot")

    # Metadata counters win when present and non-zero; otherwise fall back to
    # the manifest length / zero.
    declared_items = parse_int(metadata.get("item_count"))
    declared_done = parse_int(metadata.get("completed_count"))

    return SessionPayload(
        session_id=session_id,
        session_name=str(metadata.get("session_name") or session_id),
        annotator=str(metadata.get("annotator") or ""),
        slot=study_slot,
        item_count=declared_items or len(items),
        completed_count=declared_done or 0,
        updated_at_utc=str(metadata.get("updated_at_utc") or ""),
        metadata=metadata,
        manifest_items=items,
        current_annotations=annotations,
    )
def validate_session_manifest(
    *,
    payload: SessionPayload,
    expected_session: dict[str, Any],
    strict: bool,
) -> list[str]:
    """Check that a session's manifest holds the same item ids as its bundle slot.

    Item ids are compared as multisets, so order is ignored but duplicates
    count. Returns an empty list on a match; on a mismatch returns a
    one-element warning list, or raises ``ValueError`` when ``strict`` is set.
    """

    def id_multiset(items: list[dict[str, Any]]) -> Counter[str]:
        return Counter(str(entry.get("item_id") or "") for entry in items)

    if id_multiset(payload.manifest_items) == id_multiset(expected_session["items"]):
        return []

    mismatch = f"session {payload.session_id} manifest does not match bundle slot {payload.slot}"
    if strict:
        raise ValueError(mismatch)
    return [mismatch]
def majority_status(counts: Counter[str], vote_count: int) -> str | None:
    """Return the status holding a strict majority of ``vote_count``, if any.

    A status wins only when its tally is more than half of ``vote_count`` and
    no other status shares that top tally. Returns ``None`` otherwise,
    including when ``vote_count`` is zero.
    """
    if vote_count == 0:
        return None

    tallies = sorted(counts.values(), reverse=True)
    top = tallies[0] if tallies else 0
    if 2 * top <= vote_count:
        return None
    # Guard against ties at the top (possible only when vote_count does not
    # equal the sum of the tallies).
    if tallies.count(top) != 1:
        return None
    return next(status for status, tally in counts.items() if tally == top)
return None + return (p_bar - p_e) / (1.0 - p_e) + + +def build_analysis( + *, + bundle_path: Path, + sessions_dir: Path, + session_ids: list[str] | None, + strict_manifest: bool, +) -> tuple[dict[str, Any], list[dict[str, Any]], list[dict[str, Any]]]: + bundle = load_study_bundle(bundle_path) + slot_index, item_index, warnings = build_bundle_index(bundle) + selected_sessions, session_warnings = discover_sessions( + sessions_dir=sessions_dir, + bundle_path=bundle_path, + session_ids=session_ids, + ) + warnings.extend(session_warnings) + + expected_slots = sorted(slot_index) + missing_slots = [slot for slot in expected_slots if slot not in selected_sessions] + + session_rows: list[dict[str, Any]] = [] + for slot in expected_slots: + payload = selected_sessions.get(slot) + if payload is None: + continue + warnings.extend( + validate_session_manifest( + payload=payload, + expected_session=slot_index[slot], + strict=strict_manifest, + ) + ) + slot_status_counts = Counter[str]() + for item in slot_index[slot]["items"]: + item_id = str(item["item_id"]) + annotation = payload.current_annotations.get(item_id) or {} + slot_status_counts[normalize_status(annotation.get("overall_status"))] += 1 + reviewed_count = ( + len(slot_index[slot]["items"]) - slot_status_counts["unreviewed"] + ) + if reviewed_count != payload.completed_count: + warnings.append( + f"session {payload.session_id} metadata says completed_count={payload.completed_count} but current_annotations implies {reviewed_count}" + ) + status_block = status_ratio_block(slot_status_counts) + session_rows.append( + { + "slot": slot, + "session_id": payload.session_id, + "session_name": payload.session_name, + "annotator": payload.annotator, + "item_count": len(slot_index[slot]["items"]), + "reviewed_count": reviewed_count, + "unreviewed_count": slot_status_counts["unreviewed"], + "ok": slot_status_counts["ok"], + "not_ok": slot_status_counts["not_ok"], + "uncertain": slot_status_counts["uncertain"], + 
"accept_ratio_decided": status_block["accept_ratio_decided"], + "reject_ratio_decided": status_block["reject_ratio_decided"], + "uncertain_rate_all": status_block["uncertain_rate_all"], + "updated_at_utc": payload.updated_at_utc, + } + ) + + vote_level_counts_all = Counter[str]() + vote_level_counts_single = Counter[str]() + vote_level_counts_agreement = Counter[str]() + item_rows: list[dict[str, Any]] = [] + + for item_id, record in sorted(item_index.items()): + votes: list[dict[str, Any]] = [] + missing_session_slots_for_item: list[int] = [] + unreviewed_slots: list[int] = [] + available_slots: list[int] = [] + + for slot in record["assigned_slots"]: + payload = selected_sessions.get(slot) + if payload is None: + missing_session_slots_for_item.append(slot) + continue + available_slots.append(slot) + annotation = payload.current_annotations.get(item_id) or {} + status = normalize_status(annotation.get("overall_status")) + if status == "unreviewed": + unreviewed_slots.append(slot) + continue + vote = { + "slot": slot, + "session_id": payload.session_id, + "annotator": payload.annotator, + "overall_status": status, + "updated_at_utc": annotation.get("updated_at_utc", ""), + } + votes.append(vote) + vote_level_counts_all[status] += 1 + if record["study_assignment"] == "agreement": + vote_level_counts_agreement[status] += 1 + else: + vote_level_counts_single[status] += 1 + + vote_counts = Counter(vote["overall_status"] for vote in votes) + vote_count = len(votes) + majority = majority_status(vote_counts, vote_count) + is_complete = vote_count == int(record["expected_votes"]) + is_unanimous = is_complete and len(vote_counts) == 1 + final_status = None + if record["study_assignment"] == "single" and vote_count == 1: + final_status = votes[0]["overall_status"] + elif record["study_assignment"] == "agreement" and is_complete and majority: + final_status = majority + + item_rows.append( + { + "item_id": item_id, + "study_assignment": record["study_assignment"], + 
"expected_votes": int(record["expected_votes"]), + "assigned_slots": json.dumps(record["assigned_slots"]), + "available_slots": json.dumps(available_slots), + "missing_session_slots": json.dumps(missing_session_slots_for_item), + "unreviewed_slots": json.dumps(unreviewed_slots), + "vote_count": vote_count, + "ok_votes": vote_counts["ok"], + "not_ok_votes": vote_counts["not_ok"], + "uncertain_votes": vote_counts["uncertain"], + "is_complete": is_complete, + "is_unanimous": is_unanimous, + "has_majority": majority is not None, + "majority_status": majority or "", + "final_status": final_status or "", + "votes_json": json.dumps(votes, ensure_ascii=False), + "industry_slug": record.get("industry_slug"), + "report_name": record.get("report_name"), + "exchange": record.get("exchange"), + "ticker": record.get("ticker"), + "year": record.get("year"), + "page_index": record.get("page_index"), + "page_number": record.get("page_number"), + "table_index": record.get("table_index"), + "table_row_count": record.get("table_row_count"), + "table_col_count": record.get("table_col_count"), + } + ) + + agreement_rows = [ + row for row in item_rows if row["study_assignment"] == "agreement" + ] + complete_agreement_rows = [row for row in agreement_rows if row["is_complete"]] + agreement_rows_with_2plus_votes = [ + row for row in agreement_rows if int(row["vote_count"]) >= 2 + ] + unanimous_rows = [row for row in complete_agreement_rows if row["is_unanimous"]] + majority_rows = [ + row + for row in complete_agreement_rows + if row["has_majority"] and not row["is_unanimous"] + ] + no_majority_rows = [ + row for row in complete_agreement_rows if not row["has_majority"] + ] + + final_status_counts = Counter( + row["final_status"] for row in item_rows if row["final_status"] + ) + agreement_final_counts = Counter( + row["majority_status"] + for row in complete_agreement_rows + if row["majority_status"] + ) + + summary = { + "analysis_completed_at_utc": utc_now(), + "study_bundle_path": 
def write_csv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None:
    """Atomically write ``rows`` to ``path`` as UTF-8 CSV with a header row.

    Rows are written to a sibling ``<name>.tmp`` file which then replaces
    ``path``, so readers never observe a half-written file. Keys not listed
    in ``fieldnames`` are ignored; missing keys produce empty cells.

    Args:
        path: Destination file; parent directories are created as needed.
        rows: One dict per CSV row.
        fieldnames: Column order for the header and every row.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with tmp.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(
                handle, fieldnames=fieldnames, extrasaction="ignore"
            )
            writer.writeheader()
            writer.writerows(rows)
        tmp.replace(path)
    finally:
        # After a successful replace the temp file no longer exists; on any
        # failure this removes the partial file instead of leaving it behind.
        tmp.unlink(missing_ok=True)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the study analysis and write all report files.

    Writes ``summary.json``, ``summary.md``, ``session_metrics.csv`` and
    ``item_metrics.csv`` into the output directory, then prints a short JSON
    headline to stdout. Returns 0.
    """
    session_columns = [
        "slot",
        "session_id",
        "session_name",
        "annotator",
        "item_count",
        "reviewed_count",
        "unreviewed_count",
        "ok",
        "not_ok",
        "uncertain",
        "accept_ratio_decided",
        "reject_ratio_decided",
        "uncertain_rate_all",
        "updated_at_utc",
    ]
    item_columns = [
        "item_id",
        "study_assignment",
        "expected_votes",
        "assigned_slots",
        "available_slots",
        "missing_session_slots",
        "unreviewed_slots",
        "vote_count",
        "ok_votes",
        "not_ok_votes",
        "uncertain_votes",
        "is_complete",
        "is_unanimous",
        "has_majority",
        "majority_status",
        "final_status",
        "votes_json",
        "industry_slug",
        "report_name",
        "exchange",
        "ticker",
        "year",
        "page_index",
        "page_number",
        "table_index",
        "table_row_count",
        "table_col_count",
    ]

    args = build_arg_parser().parse_args(argv)
    output_dir = args.output_dir
    if not output_dir:
        # Default location is derived from the bundle file name.
        output_dir = DEFAULT_ANALYSIS_ROOT / args.study_bundle.stem

    summary, session_rows, item_rows = build_analysis(
        bundle_path=args.study_bundle,
        sessions_dir=args.sessions_dir,
        session_ids=args.session_ids,
        strict_manifest=args.strict_manifest,
    )

    atomic_write_json(output_dir / "summary.json", summary)
    atomic_write_text(
        output_dir / "summary.md",
        render_summary_markdown(summary=summary, session_rows=session_rows),
    )
    write_csv(
        output_dir / "session_metrics.csv", session_rows, fieldnames=session_columns
    )
    write_csv(output_dir / "item_metrics.csv", item_rows, fieldnames=item_columns)

    print(f"Wrote study analysis to {output_dir}")
    agreement = summary["agreement"]
    headline = {
        "sessions_found": summary["session_coverage"]["sessions_found"],
        "agreement_tables_complete": agreement["tables_complete"],
        "exact_agreement_rate": agreement["exact_agreement_rate"],
        "final_table_accept_ratio": summary["final_table_decisions"][
            "status_ratios"
        ]["accept_ratio_decided"],
        "final_table_reject_ratio": summary["final_table_decisions"][
            "status_ratios"
        ]["reject_ratio_decided"],
        "warnings": len(summary.get("warnings") or []),
    }
    print(json.dumps(headline, indent=2))
    return 0
def load_manifest_items(path: Path) -> list[dict[str, Any]]:
    """Load the ``items`` list from a table manifest JSON file.

    Args:
        path: Manifest file, UTF-8 encoded JSON.

    Returns:
        The manifest's ``items`` list, unmodified.

    Raises:
        ValueError: the JSON root is not an object, or ``items`` is missing
            or not a list.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    # A JSON root that is a list/string/number has no .get(); report it as an
    # invalid manifest rather than failing with AttributeError.
    items = payload.get("items") if isinstance(payload, dict) else None
    if not isinstance(items, list):
        raise ValueError(f"invalid manifest items in {path}")
    return items
def build_session_items(
    *,
    selected_items: list[dict[str, Any]],
    annotator_count: int,
    overlap_tables: int,
    seed: int,
    min_session_items: int,
    max_session_items: int,
) -> dict[str, Any]:
    """Split the selected tables into one item queue per annotator session.

    The first ``overlap_tables`` entries of ``selected_items`` become
    "agreement" items, each placed in ``DEFAULT_REQUIRED_VOTES`` sessions by
    ``choose_overlap_sessions``; the remaining entries become "single" items,
    each placed in exactly one session. Session sizes come from
    ``balanced_counts`` over the total number of annotations.

    Returns:
        A dict with ``annotator_count``, ``session_item_counts``,
        ``overlap_tables``, ``unique_tables`` (the number of selected
        tables), ``total_annotations`` and ``sessions`` (one dict per slot,
        slots numbered from 1).

    Raises:
        ValueError: ``overlap_tables`` exceeds the number of selected tables,
            a session size falls outside
            ``[min_session_items, max_session_items]``, or the per-session
            single-item counts do not add up to the remaining tables.
    """
    total_tables = len(selected_items)
    if overlap_tables > total_tables:
        raise ValueError("overlap table count cannot exceed selected tables")

    # Every overlap table is annotated DEFAULT_REQUIRED_VOTES times, i.e.
    # (DEFAULT_REQUIRED_VOTES - 1) extra annotations on top of its first one.
    total_annotations = total_tables + (DEFAULT_REQUIRED_VOTES - 1) * overlap_tables
    session_sizes = balanced_counts(total_annotations, annotator_count)
    if any(
        size < min_session_items or size > max_session_items for size in session_sizes
    ):
        raise ValueError(
            f"cannot distribute {total_annotations} annotations across {annotator_count} sessions "
            f"inside [{min_session_items}, {max_session_items}]"
        )

    overlap_items = selected_items[:overlap_tables]
    unique_items = selected_items[overlap_tables:]
    overlap_counts = balanced_counts(
        overlap_tables * DEFAULT_REQUIRED_VOTES, annotator_count
    )
    overlap_assignments = choose_overlap_sessions(
        overlap_items=overlap_items,
        overlap_counts=overlap_counts,
        seed=seed + annotator_count,
    )

    # Whatever capacity a session has left after its agreement items is
    # filled with single-annotator items.
    unique_counts = [
        session_sizes[index] - len(overlap_assignments[index])
        for index in range(annotator_count)
    ]
    if sum(unique_counts) != len(unique_items):
        raise ValueError("unique assignment counts do not match remaining tables")

    # One RNG drives both the pool shuffle and the per-session shuffles below;
    # the order of these calls determines the generated bundle.
    rng = random.Random(seed + 1000 + annotator_count)
    unique_pool = list(unique_items)
    rng.shuffle(unique_pool)

    sessions: list[dict[str, Any]] = []
    cursor = 0  # next unused index into unique_pool
    for session_index in range(annotator_count):
        agreement_records = [
            {
                **dict(item),
                "study_assignment": "agreement",
                "study_expected_votes": DEFAULT_REQUIRED_VOTES,
                "study_session_slot": session_index + 1,
            }
            for item in overlap_assignments[session_index]
        ]
        unique_records = []
        for _ in range(unique_counts[session_index]):
            item = unique_pool[cursor]
            cursor += 1
            unique_records.append(
                {
                    **dict(item),
                    "study_assignment": "single",
                    "study_expected_votes": 1,
                    "study_session_slot": session_index + 1,
                }
            )

        # Interleave agreement and single items within the session queue.
        manifest_items = agreement_records + unique_records
        rng.shuffle(manifest_items)
        sessions.append(
            {
                "slot": session_index + 1,
                "target_items": len(manifest_items),
                "agreement_items": len(agreement_records),
                "single_items": len(unique_records),
                "items": manifest_items,
            }
        )

    return {
        "annotator_count": annotator_count,
        "session_item_counts": [session["target_items"] for session in sessions],
        "overlap_tables": overlap_tables,
        "unique_tables": total_tables,
        "total_annotations": total_annotations,
        "sessions": sessions,
    }
def write_bundle(path: Path, payload: dict[str, Any]) -> None:
    """Serialize ``payload`` as indented JSON and atomically place it at ``path``.

    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
    over ``path``; parent directories are created when missing.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    staging = path.with_suffix(f"{path.suffix}.tmp")
    serialized = json.dumps(payload, indent=2)
    staging.write_text(serialized, encoding="utf-8")
    staging.replace(path)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: build one study bundle per requested annotator count.

    For each count in ``--annotators`` a bundle is built from the source
    manifest, written to ``<output-dir>/study_sessions_<count>.json`` and a
    JSON summary is printed. Returns 0.

    Raises:
        ValueError: a requested annotator count has no entry in
            ``DEFAULT_OVERLAP_BY_ANNOTATORS``. This is checked for every
            requested count before any bundle is written.
    """
    args = build_arg_parser().parse_args(argv)

    # Fail fast: reject unsupported counts up front so a bad value late in the
    # list cannot leave a partially regenerated set of bundle files on disk.
    for annotator_count in args.annotators:
        if annotator_count not in DEFAULT_OVERLAP_BY_ANNOTATORS:
            raise ValueError(
                f"no default overlap setting for annotator count {annotator_count}"
            )

    for annotator_count in args.annotators:
        overlap_tables = DEFAULT_OVERLAP_BY_ANNOTATORS[annotator_count]
        bundle = build_study_bundle(
            source_manifest_path=args.source_manifest,
            annotator_count=annotator_count,
            overlap_tables=overlap_tables,
            total_tables=args.total_tables,
            seed=args.seed,
            min_session_items=args.min_session_items,
            max_session_items=args.max_session_items,
        )
        output_path = args.output_dir / f"study_sessions_{annotator_count}.json"
        write_bundle(output_path, bundle)
        print(
            json.dumps(
                {
                    "annotator_count": annotator_count,
                    "overlap_tables": overlap_tables,
                    "output": str(output_path),
                    **bundle["summary"],
                },
                indent=2,
            )
        )
    return 0
None = None) -> int: + args = build_arg_parser().parse_args(argv) + session_ids = list(args.session_id) + if args.all: + session_ids.extend(metadata["session_id"] for metadata in list_sessions()) + + seen = set() + regenerated = [] + for session_id in session_ids: + if session_id in seen: + continue + seen.add(session_id) + regenerated.append( + {"session_id": session_id, **write_summary_files(session_id)} + ) + + combined = None + if args.all or args.combined_output: + combined = str(write_all_sessions_summary(args.combined_output)) + + print( + json.dumps( + {"regenerated": regenerated, "combined_summary_csv": combined}, indent=2 + ) + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/annotation_OCR/templates/index.html b/annotation_OCR/templates/index.html new file mode 100644 index 0000000..bc13f29 --- /dev/null +++ b/annotation_OCR/templates/index.html @@ -0,0 +1,155 @@ + + + + + + + OCR Annotation + + + + + + +
+
+
OCR annotation
+
Loading session
+
+
+ +
+
0 / 0 reviewed
+ +
+ + +
+ +
+
+
+
+
Raw page
+
+
+
+ + + + +
+
+
+
+ Raw OCR source page + +
+ +
+
+ +
+
+
+
Extracted content
+
+
+
+ + +
+
+
+ +
+ + +
+ + +
+
+

Keyboard

+ +
+
+ A +

Yes, save, next

+ R +

No, save, next

+ U +

Uncertain, save, next

+ J / K +

Next / previous

+ + / - / 0 +

Zoom

+ F +

Refocus on the table

+
+
+
+ + + \ No newline at end of file diff --git a/annotation_OCR/templates/landing.html b/annotation_OCR/templates/landing.html new file mode 100644 index 0000000..5a69e0d --- /dev/null +++ b/annotation_OCR/templates/landing.html @@ -0,0 +1,263 @@ + + + + + + + OCR Annotation — Start + + + + + +
+

OCR Annotation

+

Enter your name to start a new annotation session, or resume an existing one below.

+ +
+

New session

+
+
+ + +
+
+ + +
+ +
+ +
+ +
+

Resume existing session

+
    +
  • Loading…
  • +
+
+
#!/usr/bin/env python3
r"""Replace broken dollar markers in .mmd files using heuristic-based selection.

Heuristic A (pair-based):
- Adjacent marker pair "\\(" then "\\)" with no curly braces between them.

Heuristic B (money-context):
- Marker appears to precede an amount-like token or nearby money phrasing.
- Excludes obvious math-like markup such as "\\( _{2}" and "\\( ^{TM}".

Selection strategies:
- money: use only money-context markers (higher recall; default).
- overlap: use intersection of pair-based and money-context markers (higher precision).

Always-on exact rule:
- Replace exact table cell markers "<td>\(</td>" and "<td>\)</td>".
"""

from __future__ import annotations

import argparse
import re
from pathlib import Path
from typing import Iterable

# Either of the two marker tokens: a backslash followed by "(" or ")".
MARKER_RE = re.compile(r"\\\(|\\\)")
# The patterns below are matched against the text that FOLLOWS a marker.
MONEY_NUM_RE = re.compile(r"^\s*[\(\-]?\d(?:[\d,]*\.?\d*)")
MONEY_WORD_RE = re.compile(r"^.{0,24}\b(?:million|billion|thousand)\b", re.IGNORECASE)
MONEY_PHRASE_RE = re.compile(
    r"^.{0,30}\b(?:per\s+share|per\s+ton|per\s+gallon|per\s+bushel|market\s+value)\b",
    re.IGNORECASE,
)
MATHISH_RE = re.compile(r"^\s*[_\^]?\s*\{")
# A table cell whose entire content is one marker. The <td>/</td> anchors are
# what make this rule "exact": without them the pattern would match every
# marker in the file and override the selected strategy.
EXACT_TD_RE = re.compile(r"<td>(\\\(|\\\))</td>")

# Every marker token is two characters long (backslash + parenthesis).
_MARKER_LEN = 2
# Number of characters after a marker inspected by the money-context heuristic.
_CONTEXT_WINDOW = 64
# Per-file counters returned by process_file and summed by main.
_STAT_KEYS = ("markers", "user", "money", "overlap", "td_exact", "replaced", "changed")


def iter_mmd_files(root: Path) -> Iterable[Path]:
    """Yield every regular file ending in ``.mmd`` found recursively under ``root``."""
    for path in root.rglob("*.mmd"):
        if path.is_file():
            yield path


def get_markers(text: str) -> list[tuple[int, str]]:
    """Return ``(offset, token)`` for each marker in ``text``, in document order."""
    return [(m.start(), m.group(0)) for m in MARKER_RE.finditer(text)]


def select_user_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
    """Pair-based heuristic (A).

    Selects both offsets of every pair of consecutive markers that is an
    opening marker followed by a closing marker with no curly brace in the
    text between them.
    """
    selected: set[int] = set()
    for (pos_a, tok_a), (pos_b, tok_b) in zip(markers, markers[1:]):
        if tok_a != r"\(" or tok_b != r"\)":
            continue
        between = text[pos_a + _MARKER_LEN : pos_b]
        if "{" in between or "}" in between:
            continue
        selected.update((pos_a, pos_b))
    return selected


def select_money_context_markers(text: str, markers: list[tuple[int, str]]) -> set[int]:
    """Money-context heuristic (B).

    Selects the offset of every marker whose following text (up to
    ``_CONTEXT_WINDOW`` characters) starts with a number, or mentions a
    magnitude word or a money phrase within the windows encoded in the
    ``MONEY_*`` patterns.
    """
    selected: set[int] = set()
    for pos, _tok in markers:
        start = pos + _MARKER_LEN
        after = text[start : start + _CONTEXT_WINDOW]

        # Exclude obvious math-like constructions: \( _{...}, \( ^{...}, \({ ...
        if MATHISH_RE.match(after):
            continue

        if (
            MONEY_NUM_RE.match(after)
            or MONEY_WORD_RE.match(after)
            or MONEY_PHRASE_RE.match(after)
        ):
            selected.add(pos)
    return selected


def select_exact_td_markers(text: str) -> set[int]:
    """Return the offsets of markers that are the sole content of a ``<td>`` cell."""
    # Capture the marker token position inside exact HTML cells like <td>\(</td>.
    return {m.start(1) for m in EXACT_TD_RE.finditer(text)}


def apply_replacements(
    text: str, markers: list[tuple[int, str]], positions: set[int]
) -> tuple[str, int]:
    """Replace each marker whose offset is in ``positions`` with ``$``.

    Returns the rewritten text and the number of replacements made. Offsets
    in ``positions`` that do not correspond to an entry of ``markers`` are
    ignored.
    """
    if not positions:
        return text, 0

    pieces: list[str] = []
    cursor = 0
    replaced = 0

    for pos, _tok in markers:
        if pos not in positions:
            continue
        pieces.append(text[cursor:pos])
        pieces.append("$")
        cursor = pos + _MARKER_LEN
        replaced += 1

    pieces.append(text[cursor:])
    return "".join(pieces), replaced


def process_file(path: Path, dry_run: bool, strategy: str) -> dict[str, int]:
    """Apply the selected strategy to one file and return its counters.

    The file is rewritten only when at least one marker was replaced and
    ``dry_run`` is false.

    Raises:
        ValueError: if ``strategy`` is neither ``"money"`` nor ``"overlap"``.
    """
    if strategy not in ("money", "overlap"):
        raise ValueError(f"Unknown strategy: {strategy}")

    # newline="" disables newline translation so that the file's original
    # line endings survive the read/modify/write round trip.
    with path.open("r", encoding="utf-8", newline="") as handle:
        text = handle.read()
    markers = get_markers(text)

    user_positions = select_user_markers(text, markers)
    money_positions = select_money_context_markers(text, markers)
    overlap = user_positions & money_positions
    td_exact_positions = select_exact_td_markers(text)

    base_positions = money_positions if strategy == "money" else overlap
    selected_positions = base_positions | td_exact_positions

    updated_text, replaced = apply_replacements(text, markers, selected_positions)

    if replaced > 0 and not dry_run:
        with path.open("w", encoding="utf-8", newline="") as handle:
            handle.write(updated_text)

    return {
        "markers": len(markers),
        "user": len(user_positions),
        "money": len(money_positions),
        "overlap": len(overlap),
        "td_exact": len(td_exact_positions),
        "replaced": replaced,
        "changed": int(replaced > 0),
    }


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point; prints a summary and returns the exit status.

    ``argv`` defaults to ``None``, in which case argparse reads the process
    arguments.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Replace broken dollar markers in .mmd files using heuristic-based "
            "selection."
        )
    )
    parser.add_argument(
        "directory", type=Path, help="Root directory to scan recursively"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Compute and report changes without writing files",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print per-file replacement counts",
    )
    parser.add_argument(
        "--strategy",
        choices=("money", "overlap"),
        default="money",
        help=(
            "Replacement selection strategy: 'money' (higher recall, default) "
            "or 'overlap' (higher precision)."
        ),
    )
    args = parser.parse_args(argv)

    root = args.directory
    if not root.exists() or not root.is_dir():
        raise SystemExit(f"Directory not found or not a directory: {root}")

    totals = {"files": 0, **dict.fromkeys(_STAT_KEYS, 0)}

    for path in iter_mmd_files(root):
        stats = process_file(path, dry_run=args.dry_run, strategy=args.strategy)
        totals["files"] += 1
        for key in _STAT_KEYS:
            totals[key] += stats[key]

        if args.verbose and stats["replaced"] > 0:
            print(f"{path}: replacements={stats['replaced']}")

    mode = "DRY RUN" if args.dry_run else "APPLY"
    print(f"MODE={mode}")
    print(f"STRATEGY={args.strategy}")
    print(f"FILES_SCANNED={totals['files']}")
    print(f"TOTAL_MARKER_TOKENS={totals['markers']}")
    print(f"USER_HEURISTIC_TOTAL={totals['user']}")
    print(f"MONEY_HEURISTIC_TOTAL={totals['money']}")
    print(f"OVERLAP_TOTAL={totals['overlap']}")
    print(f"EXACT_TD_TOTAL={totals['td_exact']}")
    print(f"REPLACEMENTS={totals['replaced']}")
    print(f"FILES_CHANGED={totals['changed']}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
100644 --- a/uv.lock +++ b/uv.lock @@ -34,6 +34,9 @@ name = "ardian-dataset-bench" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "bleach" }, + { name = "flask" }, + { name = "markdown" }, { name = "openai" }, { name = "pydantic" }, { name = "tqdm" }, @@ -42,6 +45,9 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "bleach", specifier = ">=6.3.0" }, + { name = "flask", specifier = ">=3.1.3" }, + { name = "markdown", specifier = ">=3.10.2" }, { name = "openai", specifier = ">=2.33.0" }, { name = "pydantic", specifier = ">=2.13.3" }, { name = "tqdm", specifier = ">=4.67.3" }, @@ -61,6 +67,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, ] +[[package]] +name = "bleach" +version = "6.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "webencodings" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/07/18/3c8523962314be6bf4c8989c79ad9531c825210dd13a8669f6b84336e8bd/bleach-6.3.0.tar.gz", hash = "sha256:6f3b91b1c0a02bb9a78b5a454c92506aa0fdf197e1d5e114d2e00c6f64306d22", size = 203533, upload-time = "2025-10-27T17:57:39.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" }, +] + +[[package]] +name = "blinker" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = 
"sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, +] + [[package]] name = "certifi" version = "2026.4.22" @@ -172,6 +199,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" }, ] +[[package]] +name = "click" +version = "8.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/98/518d8e5081007684232226f475082b30087d0f585e8457db087298259f49/click-8.4.1.tar.gz", hash = "sha256:918b5633eddf6b41c32d4f454bf0de810065c74e3f7dbf8ee5452f8be88d3e96", size = 353007, upload-time = "2026-05-22T04:08:37.769Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/0d/67e5b4109ea4a837e80daa87c2c696711955e40449a97e8926672534def2/click-8.4.1-py3-none-any.whl", hash = "sha256:482be17c6991b8c19c5429a1e995d9b0efdbb63172824c41f99965dc0ade8ec2", size = 116639, upload-time = "2026-05-22T04:08:35.26Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -223,6 +262,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, ] +[[package]] +name = "flask" 
+version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blinker" }, + { name = "click" }, + { name = "itsdangerous" }, + { name = "jinja2" }, + { name = "markupsafe" }, + { name = "werkzeug" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" }, +] + [[package]] name = "frozendict" version = "2.4.7" @@ -278,6 +334,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" }, ] +[[package]] +name = "itsdangerous" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + [[package]] name = "jiter" version = "0.14.0" @@ -332,6 +409,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" }, ] +[[package]] +name = "markdown" +version = "3.10.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" }, +] + [[package]] name = "markdown-it-py" version = "4.0.0" @@ -344,6 +430,58 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", 
hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, ] +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" }, + { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" }, + { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" }, + { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" }, + { url = 
"https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" }, + { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" }, + { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" }, + { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" }, + { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" }, + { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" }, + { url = 
"https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" }, + { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" }, + { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" }, + { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" }, + { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" }, + { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" }, + { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" }, + { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" }, + { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" }, + { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" }, + { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" }, + { url = 
"https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" }, + { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" }, + { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" }, + { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" }, + { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" }, + { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" }, + { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" }, + { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" }, + { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" }, + { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" }, + { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" }, + { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" }, + { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" }, + { 
url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" }, + { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, + { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, + { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, +] + [[package]] name = "mdurl" version = "0.1.2" @@ -698,6 +836,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, ] +[[package]] +name = "webencodings" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721, upload-time = "2017-04-05T20:21:34.189Z" } +wheels = 
[ + { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload-time = "2017-04-05T20:21:32.581Z" }, +] + [[package]] name = "websockets" version = "16.0" @@ -734,6 +881,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" }, ] +[[package]] +name = "werkzeug" +version = "3.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852, upload-time = "2026-04-02T18:49:14.268Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/93/8c/2e650f2afeb7ee576912636c23ddb621c91ac6a98e66dc8d29c3c69446e1/werkzeug-3.1.8-py3-none-any.whl", hash = "sha256:63a77fb8892bf28ebc3178683445222aa500e48ebad5ec77b0ad80f8726b1f50", size = 226459, upload-time = "2026-04-02T18:49:12.72Z" }, +] + [[package]] name = "yfinance" version = "1.3.0"