From 69e81b574dc99774215bd25bf287fd9f58b2672d Mon Sep 17 00:00:00 2001 From: Charles Moslonka Date: Fri, 22 May 2026 11:54:30 +0200 Subject: [PATCH 1/8] MAINT: add web deps --- pyproject.toml | 3 + uv.lock | 159 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index cf0bb03..0d39bc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,9 @@ description = "Add your description here" readme = "README.md" requires-python = ">=3.13" dependencies = [ + "bleach>=6.3.0", + "flask>=3.1.3", + "markdown>=3.10.2", "openai>=2.33.0", "pydantic>=2.13.3", "tqdm>=4.67.3", diff --git a/uv.lock b/uv.lock index c0651b8..869ffcd 100644 --- a/uv.lock +++ b/uv.lock @@ -34,6 +34,9 @@ name = "ardian-dataset-bench" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "bleach" }, + { name = "flask" }, + { name = "markdown" }, { name = "openai" }, { name = "pydantic" }, { name = "tqdm" }, @@ -42,6 +45,9 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "bleach", specifier = ">=6.3.0" }, + { name = "flask", specifier = ">=3.1.3" }, + { name = "markdown", specifier = ">=3.10.2" }, { name = "openai", specifier = ">=2.33.0" }, { name = "pydantic", specifier = ">=2.13.3" }, { name = "tqdm", specifier = ">=4.67.3" }, @@ -61,6 +67,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, ] +[[package]] +name = "bleach" +version = "6.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "webencodings" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/07/18/3c8523962314be6bf4c8989c79ad9531c825210dd13a8669f6b84336e8bd/bleach-6.3.0.tar.gz", hash = 
"sha256:6f3b91b1c0a02bb9a78b5a454c92506aa0fdf197e1d5e114d2e00c6f64306d22", size = 203533, upload-time = "2025-10-27T17:57:39.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cd/3a/577b549de0cc09d95f11087ee63c739bba856cd3952697eec4c4bb91350a/bleach-6.3.0-py3-none-any.whl", hash = "sha256:fe10ec77c93ddf3d13a73b035abaac7a9f5e436513864ccdad516693213c65d6", size = 164437, upload-time = "2025-10-27T17:57:37.538Z" }, +] + +[[package]] +name = "blinker" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, +] + [[package]] name = "certifi" version = "2026.4.22" @@ -172,6 +199,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" }, ] +[[package]] +name = "click" +version = "8.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/98/518d8e5081007684232226f475082b30087d0f585e8457db087298259f49/click-8.4.1.tar.gz", hash = "sha256:918b5633eddf6b41c32d4f454bf0de810065c74e3f7dbf8ee5452f8be88d3e96", size = 353007, upload-time = "2026-05-22T04:08:37.769Z" } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/c7/0d/67e5b4109ea4a837e80daa87c2c696711955e40449a97e8926672534def2/click-8.4.1-py3-none-any.whl", hash = "sha256:482be17c6991b8c19c5429a1e995d9b0efdbb63172824c41f99965dc0ade8ec2", size = 116639, upload-time = "2026-05-22T04:08:35.26Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -223,6 +262,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, ] +[[package]] +name = "flask" +version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blinker" }, + { name = "click" }, + { name = "itsdangerous" }, + { name = "jinja2" }, + { name = "markupsafe" }, + { name = "werkzeug" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" }, +] + [[package]] name = "frozendict" version = "2.4.7" @@ -278,6 +334,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" }, ] +[[package]] +name = "itsdangerous" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" 
} +sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + [[package]] name = "jiter" version = "0.14.0" @@ -332,6 +409,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" }, ] +[[package]] +name = "markdown" +version = "3.10.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = 
"sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" }, +] + [[package]] name = "markdown-it-py" version = "4.0.0" @@ -344,6 +430,58 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, ] +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" }, + { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" }, + { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" }, + { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" }, + { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" }, + { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" }, + { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" }, + { url 
= "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" }, + { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" }, + { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" }, + { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" }, + { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" }, + { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" }, + { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" }, + { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" }, + { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" }, + { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" }, + { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" }, + { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" }, + { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" }, + { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" }, + { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" }, + { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" }, + { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" }, + { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" }, + { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" }, + { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" }, + { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" }, + { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" }, + { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" }, + { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" }, + { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, + { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, + { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, +] + [[package]] name = "mdurl" version = "0.1.2" @@ -698,6 +836,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, ] +[[package]] +name = "webencodings" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721, upload-time = "2017-04-05T20:21:34.189Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload-time = "2017-04-05T20:21:32.581Z" }, +] + [[package]] name = "websockets" version = "16.0" @@ -734,6 +881,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" }, ] +[[package]] +name = "werkzeug" +version = "3.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852, upload-time = "2026-04-02T18:49:14.268Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/93/8c/2e650f2afeb7ee576912636c23ddb621c91ac6a98e66dc8d29c3c69446e1/werkzeug-3.1.8-py3-none-any.whl", hash = 
"sha256:63a77fb8892bf28ebc3178683445222aa500e48ebad5ec77b0ad80f8726b1f50", size = 226459, upload-time = "2026-04-02T18:49:12.72Z" }, +] + [[package]] name = "yfinance" version = "1.3.0" From 6e441d9a9bd17288a288da558ad13f8001498ada Mon Sep 17 00:00:00 2001 From: Charles Moslonka Date: Fri, 22 May 2026 11:54:30 +0200 Subject: [PATCH 2/8] MAINT: ignore OCR sessions --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 894a2bc..4db1aa8 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,9 @@ doc_text_processing/CEO_word_extraction/cleaning_extractions/cleaned/ KPI_analysis/cache/ KPI_analysis/output/ +# OCR annotation artifacts +annotation_OCR/sessions/ + # VSCode settings .vscode/settings.json From 1e3f055c58dc74e85bed6676bb6f355cae1fe935 Mon Sep 17 00:00:00 2001 From: Charles Moslonka Date: Fri, 22 May 2026 11:54:35 +0200 Subject: [PATCH 3/8] ENH: add OCR annotator --- annotation_OCR/__init__.py | 1 + annotation_OCR/ocr_index.py | 604 ++++++++++++++++++++++++++++ annotation_OCR/server.py | 365 +++++++++++++++++ annotation_OCR/static/app.js | 352 ++++++++++++++++ annotation_OCR/static/style.css | 448 +++++++++++++++++++++ annotation_OCR/store.py | 400 ++++++++++++++++++ annotation_OCR/summarize.py | 58 +++ annotation_OCR/templates/index.html | 204 ++++++++++ 8 files changed, 2432 insertions(+) create mode 100644 annotation_OCR/__init__.py create mode 100644 annotation_OCR/ocr_index.py create mode 100644 annotation_OCR/server.py create mode 100644 annotation_OCR/static/app.js create mode 100644 annotation_OCR/static/style.css create mode 100644 annotation_OCR/store.py create mode 100644 annotation_OCR/summarize.py create mode 100644 annotation_OCR/templates/index.html diff --git a/annotation_OCR/__init__.py b/annotation_OCR/__init__.py new file mode 100644 index 0000000..e045a18 --- /dev/null +++ b/annotation_OCR/__init__.py @@ -0,0 +1 @@ +"""OCR annotation interface package.""" diff --git a/annotation_OCR/ocr_index.py 
b/annotation_OCR/ocr_index.py new file mode 100644 index 0000000..8916981 --- /dev/null +++ b/annotation_OCR/ocr_index.py @@ -0,0 +1,604 @@ +"""Build page-level OCR annotation queues. + +The annotation UI compares one raw page image with the corresponding Markdown +page extracted by DeepSeekOCR. Page positions are preserved exactly: page index +``i`` in an ``.mmd`` split maps to ``pages/page_XXXX.png`` with the same +zero-based index when the raw image exists. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import random +import re +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + + +HERE = Path(__file__).resolve().parent +REPO_ROOT = HERE.parent + +DEFAULT_OCR_ROOT = REPO_ROOT / "DeepSeekOCR_Ardian_pruned_1k" +DEFAULT_RAW_ROOT = Path( + "/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs" +) + +PAGE_SPLIT_RE = re.compile(r"<---\s*Page Split\s*--->", re.IGNORECASE) +REPORT_NAME_RE = re.compile(r"^([A-Z0-9-]+)_(.+)_(\d{4})(?:_[0-9a-fA-F]{8,})?$") +HASH_SUFFIX_RE = re.compile(r"_[0-9a-fA-F]{8,}$") + +CORE_KPI_ALIASES = { + "revenue": [ + "net sales", + "total net sales", + "sales revenue", + "revenues", + "revenue", + "net revenue", + ], + "gross_profit": ["gross profit", "gross margin"], + "operating_income": [ + "operating income", + "income from operations", + "operating profit", + ], + "net_income": [ + "net income", + "net earnings", + "net loss", + "net income attributable", + ], + "total_assets": ["total assets"], + "total_liabilities": ["total liabilities", "liabilities"], + "cash_and_equivalents": [ + "cash and cash equivalents", + "cash equivalents", + "cash, cash equivalents", + ], + "operating_cash_flow": [ + "net cash provided by operating activities", + "cash flow from operating activities", + "operating cash flow", + ], + "capex": [ + "capital expenditures", + "capital expenditure", + "additions to property, plant and equipment", + "purchase of 
property and equipment", + "additions of long-lived assets", + ], +} + +FINANCIAL_TABLE_HEADINGS = [ + "consolidated statement of operations", + "consolidated statements of operations", + "consolidated income statement", + "consolidated statements of income", + "consolidated balance sheet", + "consolidated balance sheets", + "consolidated cash flow statement", + "consolidated statements of cash flows", + "consolidated statement of cash flows", + "statements of comprehensive income", + "statement of financial position", + "notes to the consolidated financial statements", + "selected financial data", + "five year record", +] + +NUMERIC_ROW_RE = re.compile( + r"(? dict[str, Any]: + record = asdict(self) + if not include_text: + record.pop("page_text", None) + return record + + +def parse_report_name(name: str) -> tuple[str, str, int] | None: + match = REPORT_NAME_RE.match(name) + if not match: + return None + return match.group(1), match.group(2), int(match.group(3)) + + +def strip_hash_suffix(name: str) -> str: + return HASH_SUFFIX_RE.sub("", name) + + +def report_base_name(name: str) -> str: + parsed = parse_report_name(name) + if parsed is None: + return strip_hash_suffix(name) + exchange, ticker, year = parsed + return f"{exchange}_{ticker}_{year}" + + +def find_mmd(report_dir: Path) -> Path | None: + preferred = report_dir / f"{report_dir.name}.mmd" + if preferred.is_file(): + return preferred + + base_preferred = report_dir / f"{report_base_name(report_dir.name)}.mmd" + if base_preferred.is_file(): + return base_preferred + + candidates = sorted( + path for path in report_dir.glob("*.mmd") if not path.name.endswith("_det.mmd") + ) + if candidates: + return candidates[0] + + fallback = sorted(report_dir.glob("*.mmd")) + return fallback[0] if fallback else None + + +def discover_reports(root: Path) -> list[ReportInfo]: + reports: list[ReportInfo] = [] + seen_dirs = sorted({mmd.parent for mmd in root.rglob("*.mmd")}) + for report_dir in seen_dirs: + parsed = 
parse_report_name(report_dir.name) + if parsed is None: + continue + mmd_path = find_mmd(report_dir) + if mmd_path is None: + continue + exchange, ticker, year = parsed + industry_slug = report_dir.parent.name + reports.append( + ReportInfo( + industry_slug=industry_slug, + name=report_dir.name, + exchange=exchange, + ticker=ticker, + year=year, + report_dir=report_dir, + mmd_path=mmd_path, + ) + ) + return reports + + +def split_pages(raw: str) -> list[str]: + pages = [page.strip() for page in PAGE_SPLIT_RE.split(raw)] + if pages and not pages[-1]: + pages.pop() + return pages + + +def load_pages(mmd_path: Path) -> list[str]: + raw = mmd_path.read_text(encoding="utf-8", errors="replace") + return split_pages(raw) + + +def resolve_raw_dir(report: ReportInfo, raw_root: Path) -> tuple[Path | None, str]: + industry_root = raw_root / report.industry_slug + if not industry_root.is_dir(): + return None, "raw-industry-missing" + + exact = industry_root / report.name + if exact.is_dir(): + return exact, "ok-exact" + + base_name = report_base_name(report.name) + stripped = industry_root / base_name + if stripped.is_dir(): + return stripped, "ok-hash-stripped" + + matches = sorted( + path for path in industry_root.glob(f"{base_name}*") if path.is_dir() + ) + if len(matches) == 1: + return matches[0], "ok-glob" + if len(matches) > 1: + return None, "raw-dir-ambiguous" + return None, "raw-dir-missing" + + +def list_page_pngs(raw_dir: Path | None) -> list[Path]: + if raw_dir is None: + return [] + pages_dir = raw_dir / "pages" + if not pages_dir.is_dir(): + return [] + return sorted(p for p in pages_dir.glob("page_*.png") if p.is_file()) + + +def page_png_for(page_pngs: list[Path], page_index: int) -> Path | None: + expected_name = f"page_{page_index:04d}.png" + for path in page_pngs: + if path.name == expected_name: + return path + if 0 <= page_index < len(page_pngs): + return page_pngs[page_index] + return None + + +def has_markdown_table(lines: list[str]) -> bool: + if 
any(MARKDOWN_TABLE_SEPARATOR_RE.match(line) for line in lines): + return True + pipe_rows = sum(1 for line in lines if line.count("|") >= 2) + return pipe_rows >= 2 + + +def dense_numeric_row_count(lines: list[str]) -> int: + return sum(1 for line in lines if len(NUMERIC_ROW_RE.findall(line)) >= 3) + + +def detect_candidate_reasons(text: str) -> list[str]: + lowered = text.lower() + lines = [line.strip() for line in text.splitlines() if line.strip()] + reasons: list[str] = [] + + if has_markdown_table(lines): + reasons.append("markdown-table") + if "" in lowered or "" in lowered: + reasons.append("html-table") + + numeric_rows = dense_numeric_row_count(lines) + if numeric_rows >= 3: + reasons.append("dense-numeric-rows") + + if any(heading in lowered for heading in FINANCIAL_TABLE_HEADINGS): + reasons.append("financial-heading") + + aliases = sorted({alias for vals in CORE_KPI_ALIASES.values() for alias in vals}) + alias_hits = [alias for alias in aliases if alias in lowered] + if len(alias_hits) >= 2: + reasons.append("kpi-aliases") + + return reasons + + +def text_preview(text: str, max_chars: int = 500) -> str: + compact = " ".join(text.split()) + if len(compact) <= max_chars: + return compact + return compact[: max_chars - 1].rstrip() + "..." 
+ + +def page_text_hash(text: str) -> str: + return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() + + +def make_mapping_warnings( + *, raw_dir: Path | None, page_pngs: list[Path], page_index: int, mmd_page_count: int +) -> list[str]: + warnings: list[str] = [] + if raw_dir is None: + warnings.append("raw-directory-missing") + elif not (raw_dir / "pages").is_dir(): + warnings.append("raw-pages-directory-missing") + if len(page_pngs) != mmd_page_count: + warnings.append("page-count-mismatch") + if page_png_for(page_pngs, page_index) is None: + warnings.append("raw-page-image-missing") + return warnings + + +def build_all_items( + *, + ocr_root: Path, + raw_root: Path, + limit_reports: int | None = None, +) -> list[PageItem]: + return list( + iter_page_items( + ocr_root=ocr_root, + raw_root=raw_root, + limit_reports=limit_reports, + ) + ) + + +def iter_page_items( + *, + ocr_root: Path, + raw_root: Path, + limit_reports: int | None = None, +): + reports = discover_reports(ocr_root) + if limit_reports is not None: + reports = reports[:limit_reports] + + for report in reports: + pages = load_pages(report.mmd_path) + raw_dir, raw_status = resolve_raw_dir(report, raw_root) + page_pngs = list_page_pngs(raw_dir) + mmd_page_count = len(pages) + png_page_count = len(page_pngs) + + for page_index, page_text in enumerate(pages): + raw_png = page_png_for(page_pngs, page_index) + warnings = make_mapping_warnings( + raw_dir=raw_dir, + page_pngs=page_pngs, + page_index=page_index, + mmd_page_count=mmd_page_count, + ) + reasons = detect_candidate_reasons(page_text) + item_id = f"{report.industry_slug}/{report.name}/page_{page_index:04d}" + yield PageItem( + item_id=item_id, + industry_slug=report.industry_slug, + report_name=report.name, + exchange=report.exchange, + ticker=report.ticker, + year=report.year, + page_index=page_index, + page_number=page_index + 1, + ocr_root=str(ocr_root), + raw_root=str(raw_root), + report_dir=str(report.report_dir), + 
raw_dir=str(raw_dir) if raw_dir else None, + mmd_path=str(report.mmd_path), + raw_png_path=str(raw_png) if raw_png else None, + mmd_page_count=mmd_page_count, + png_page_count=png_page_count, + mapping_status=raw_status, + mapping_warnings=warnings, + candidate_reasons=reasons, + page_text_sha256=page_text_hash(page_text), + page_text_chars=len(page_text), + page_text_preview=text_preview(page_text), + page_text="", + ) + + +def new_summary_state() -> dict[str, Any]: + return { + "report_names": set(), + "pages_total": 0, + "mapping_status_counts": {}, + "mapping_warning_counts": {}, + "candidate_reason_counts": {}, + } + + +def update_summary_state(state: dict[str, Any], item: PageItem) -> None: + state["report_names"].add(item.report_name) + state["pages_total"] += 1 + statuses = state["mapping_status_counts"] + statuses[item.mapping_status] = statuses.get(item.mapping_status, 0) + 1 + warnings = state["mapping_warning_counts"] + for warning in item.mapping_warnings: + warnings[warning] = warnings.get(warning, 0) + 1 + reasons = state["candidate_reason_counts"] + for reason in item.candidate_reasons: + reasons[reason] = reasons.get(reason, 0) + 1 + + +def finish_summary_state( + state: dict[str, Any], queue: list[PageItem] +) -> dict[str, Any]: + return { + "reports_total": len(state["report_names"]), + "pages_total": state["pages_total"], + "queue_reports": len({item.report_name for item in queue}), + "queue_pages": len(queue), + "mapping_status_counts": state["mapping_status_counts"], + "mapping_warning_counts": state["mapping_warning_counts"], + "candidate_reason_counts": state["candidate_reason_counts"], + } + + +def select_queue( + items: list[PageItem], + *, + queue_mode: str, + sample_size: int | None = None, + seed: int = 17, + limit: int | None = None, +) -> list[PageItem]: + if queue_mode == "all": + selected = list(items) + elif queue_mode == "table-candidates": + selected = [item for item in items if item.candidate_reasons] + elif queue_mode == 
"sample": + size = sample_size if sample_size is not None else 100 + rng = random.Random(seed) + selected = rng.sample(items, min(size, len(items))) + selected.sort( + key=lambda item: (item.industry_slug, item.report_name, item.page_index) + ) + else: + raise ValueError(f"unknown queue mode: {queue_mode}") + + if limit is not None: + selected = selected[:limit] + return selected + + +def build_queue( + *, + ocr_root: Path, + raw_root: Path, + queue_mode: str = "table-candidates", + sample_size: int | None = None, + seed: int = 17, + limit: int | None = None, + limit_reports: int | None = None, +) -> tuple[list[PageItem], dict[str, Any]]: + if queue_mode not in {"all", "table-candidates", "sample"}: + raise ValueError(f"unknown queue mode: {queue_mode}") + + queue: list[PageItem] = [] + summary_state = new_summary_state() + rng = random.Random(seed) + sample_seen = 0 + sample_target = sample_size if sample_size is not None else 100 + scan_stopped_by_limit = False + + for item in iter_page_items( + ocr_root=ocr_root, + raw_root=raw_root, + limit_reports=limit_reports, + ): + update_summary_state(summary_state, item) + if queue_mode == "sample": + sample_seen += 1 + if len(queue) < sample_target: + queue.append(item) + else: + replace_at = rng.randint(0, sample_seen - 1) + if replace_at < sample_target: + queue[replace_at] = item + continue + + include_item = queue_mode == "all" or bool(item.candidate_reasons) + if not include_item: + continue + queue.append(item) + if limit is not None and len(queue) >= limit: + scan_stopped_by_limit = True + break + + if queue_mode == "sample": + queue.sort( + key=lambda item: (item.industry_slug, item.report_name, item.page_index) + ) + if limit is not None: + queue = queue[:limit] + + summary = finish_summary_state(summary_state, queue) + summary.update( + { + "queue_mode": queue_mode, + "sample_size": sample_size, + "seed": seed, + "limit": limit, + "limit_reports": limit_reports, + "scan_stopped_by_limit": 
scan_stopped_by_limit, + "ocr_root": str(ocr_root), + "raw_root": str(raw_root), + } + ) + return queue, summary + + +def summarize_items(all_items: list[PageItem], queue: list[PageItem]) -> dict[str, Any]: + report_names = {item.report_name for item in all_items} + queue_reports = {item.report_name for item in queue} + warnings: dict[str, int] = {} + statuses: dict[str, int] = {} + reason_counts: dict[str, int] = {} + for item in all_items: + statuses[item.mapping_status] = statuses.get(item.mapping_status, 0) + 1 + for warning in item.mapping_warnings: + warnings[warning] = warnings.get(warning, 0) + 1 + for reason in item.candidate_reasons: + reason_counts[reason] = reason_counts.get(reason, 0) + 1 + return { + "reports_total": len(report_names), + "pages_total": len(all_items), + "queue_reports": len(queue_reports), + "queue_pages": len(queue), + "mapping_status_counts": statuses, + "mapping_warning_counts": warnings, + "candidate_reason_counts": reason_counts, + } + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(json.dumps(payload, indent=2), encoding="utf-8") + tmp.replace(path) + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Build an OCR page annotation queue.") + parser.add_argument("--ocr-root", type=Path, default=DEFAULT_OCR_ROOT) + parser.add_argument("--raw-root", type=Path, default=DEFAULT_RAW_ROOT) + parser.add_argument( + "--queue-mode", + choices=["all", "table-candidates", "sample"], + default="table-candidates", + ) + parser.add_argument("--sample-size", type=int, default=None) + parser.add_argument("--seed", type=int, default=17) + parser.add_argument("--limit", type=int, default=None, help="Maximum queued pages.") + parser.add_argument( + "--limit-reports", + type=int, + default=None, + help="Read only the first N reports before queue selection.", + ) + parser.add_argument( 
+ "--output", type=Path, default=None, help="Optional manifest JSON path." + ) + parser.add_argument("--check", action="store_true", help="Print summary and exit.") + return parser + + +def main(argv: list[str] | None = None) -> int: + args = build_arg_parser().parse_args(argv) + queue, summary = build_queue( + ocr_root=args.ocr_root, + raw_root=args.raw_root, + queue_mode=args.queue_mode, + sample_size=args.sample_size, + seed=args.seed, + limit=args.limit, + limit_reports=args.limit_reports, + ) + + payload = { + "summary": summary, + "items": [item.to_manifest_record() for item in queue], + } + if args.output: + write_json(args.output, payload) + if args.check or not args.output: + print(json.dumps(summary, indent=2)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/annotation_OCR/server.py b/annotation_OCR/server.py new file mode 100644 index 0000000..6ea3793 --- /dev/null +++ b/annotation_OCR/server.py @@ -0,0 +1,365 @@ +"""Browser-based OCR page annotation server.""" + +from __future__ import annotations + +import argparse +import re +from functools import lru_cache +from pathlib import Path +from typing import Any + +import bleach +import markdown as markdown_lib +from flask import Flask, abort, jsonify, render_template, request, send_file + +from ocr_index import DEFAULT_OCR_ROOT, DEFAULT_RAW_ROOT, build_queue, load_pages +from store import ( + create_session, + list_sessions, + load_current_annotations, + load_manifest, + load_metadata, + save_annotation, + session_dir, + write_summary_files, +) + + +HERE = Path(__file__).resolve().parent +IMAGE_REF_RE = re.compile(r"(!\[[^\]]*\]\()((?:\./)?images/[^)\s]+)(\))") +HTML_IMAGE_SRC_RE = re.compile(r'(]*\bsrc=["\'])(images/[^"\']+)(["\'])', re.I) + +ALLOWED_TAGS = set(bleach.sanitizer.ALLOWED_TAGS).union( + { + "p", + "br", + "pre", + "code", + "hr", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "table", + "thead", + "tbody", + "tfoot", + "tr", + "th", + "td", + "img", + 
"blockquote", + "del", + } +) +ALLOWED_ATTRIBUTES = { + **bleach.sanitizer.ALLOWED_ATTRIBUTES, + "a": ["href", "title", "rel", "target"], + "img": ["src", "alt", "title"], + "th": ["align", "colspan", "rowspan"], + "td": ["align", "colspan", "rowspan"], +} + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Run the OCR annotation web UI.") + parser.add_argument("--ocr-root", type=Path, default=DEFAULT_OCR_ROOT) + parser.add_argument("--raw-root", type=Path, default=DEFAULT_RAW_ROOT) + parser.add_argument( + "--session-id", default=None, help="Resume an existing session." + ) + parser.add_argument("--session-name", default="OCR annotation session") + parser.add_argument("--annotator", default="anonymous") + parser.add_argument( + "--queue-mode", + choices=["all", "table-candidates", "sample"], + default="table-candidates", + ) + parser.add_argument("--sample-size", type=int, default=None) + parser.add_argument("--seed", type=int, default=17) + parser.add_argument("--limit", type=int, default=None, help="Maximum queued pages.") + parser.add_argument( + "--limit-reports", + type=int, + default=None, + help="Read only the first N reports before queue selection.", + ) + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=5050) + parser.add_argument("--debug", action="store_true") + return parser + + +def prepare_session(args: argparse.Namespace) -> str: + if args.session_id: + metadata = load_metadata(args.session_id) + return metadata["session_id"] + + queue, index_summary = build_queue( + ocr_root=args.ocr_root, + raw_root=args.raw_root, + queue_mode=args.queue_mode, + sample_size=args.sample_size, + seed=args.seed, + limit=args.limit, + limit_reports=args.limit_reports, + ) + config = { + "ocr_root": str(args.ocr_root), + "raw_root": str(args.raw_root), + "queue_mode": args.queue_mode, + "sample_size": args.sample_size, + "seed": args.seed, + "limit": args.limit, + 
"limit_reports": args.limit_reports, + } + metadata = create_session( + session_name=args.session_name, + annotator=args.annotator, + manifest_items=[item.to_manifest_record() for item in queue], + index_summary=index_summary, + config=config, + ) + return metadata["session_id"] + + +@lru_cache(maxsize=64) +def cached_pages(mmd_path: str) -> tuple[str, ...]: + return tuple(load_pages(Path(mmd_path))) + + +def get_item_or_404(session_id: str, index: int) -> dict[str, Any]: + manifest = load_manifest(session_id) + if index < 0 or index >= len(manifest): + abort(404, description="item index out of range") + return manifest[index] + + +def item_page_text(item: dict[str, Any]) -> str: + pages = cached_pages(item["mmd_path"]) + page_index = int(item.get("page_index", 0)) + if page_index < 0 or page_index >= len(pages): + return "" + return pages[page_index] + + +def rewrite_markdown_image_refs(markdown_text: str, session_id: str, index: int) -> str: + def replace_md(match: re.Match[str]) -> str: + rel_path = match.group(2).lstrip("./") + src = f"/api/session/{session_id}/item/{index}/inline-image/{rel_path}" + return f"{match.group(1)}{src}{match.group(3)}" + + return IMAGE_REF_RE.sub(replace_md, markdown_text) + + +def rewrite_html_image_refs(html: str, session_id: str, index: int) -> str: + def replace_html(match: re.Match[str]) -> str: + rel_path = match.group(2).lstrip("./") + src = f"/api/session/{session_id}/item/{index}/inline-image/{rel_path}" + return f"{match.group(1)}{src}{match.group(3)}" + + return HTML_IMAGE_SRC_RE.sub(replace_html, html) + + +def render_markdown_page(markdown_text: str, session_id: str, index: int) -> str: + rewritten = rewrite_markdown_image_refs(markdown_text, session_id, index) + html = markdown_lib.markdown( + rewritten, + extensions=["tables", "fenced_code", "sane_lists", "nl2br"], + output_format="html5", + ) + html = rewrite_html_image_refs(html, session_id, index) + return bleach.clean( + html, + tags=ALLOWED_TAGS, + 
attributes=ALLOWED_ATTRIBUTES, + protocols=["http", "https", "mailto", "data"], + ) + + +def safe_child_path(root: Path, relative_path: str) -> Path: + candidate = Path(relative_path) + if candidate.is_absolute() or ".." in candidate.parts: + abort(400, description="unsafe path") + resolved_root = root.resolve() + target = (resolved_root / candidate).resolve() + if not target.is_relative_to(resolved_root): + abort(400, description="unsafe path") + return target + + +def progress_payload(session_id: str) -> dict[str, Any]: + metadata = load_metadata(session_id) + manifest = load_manifest(session_id) + current = load_current_annotations(session_id) + status_counts: dict[str, int] = {} + for item in manifest: + status = current.get(item["item_id"], {}).get("overall_status", "unreviewed") + status_counts[status] = status_counts.get(status, 0) + 1 + + next_unreviewed_index = None + for index, item in enumerate(manifest): + if item["item_id"] not in current: + next_unreviewed_index = index + break + + return { + "metadata": metadata, + "item_count": len(manifest), + "reviewed_count": len(current), + "status_counts": status_counts, + "next_unreviewed_index": next_unreviewed_index, + } + + +def create_app(default_session_id: str, build_defaults: dict[str, Any]) -> Flask: + app = Flask(__name__, template_folder="templates", static_folder="static") + app.config["DEFAULT_SESSION_ID"] = default_session_id + app.config["BUILD_DEFAULTS"] = build_defaults + + @app.get("/") + def index() -> str: + return render_template("index.html", default_session_id=default_session_id) + + @app.get("/api/sessions") + def api_sessions() -> Any: + return jsonify( + {"sessions": list_sessions(), "default_session_id": default_session_id} + ) + + @app.post("/api/sessions") + def api_create_session() -> Any: + payload = request.get_json(force=True, silent=True) or {} + defaults = app.config["BUILD_DEFAULTS"] + queue_mode = payload.get("queue_mode") or defaults["queue_mode"] + queue, index_summary = 
build_queue( + ocr_root=Path(payload.get("ocr_root") or defaults["ocr_root"]), + raw_root=Path(payload.get("raw_root") or defaults["raw_root"]), + queue_mode=queue_mode, + sample_size=payload.get("sample_size", defaults.get("sample_size")), + seed=int(payload.get("seed", defaults["seed"])), + limit=payload.get("limit", defaults.get("limit")), + limit_reports=payload.get("limit_reports", defaults.get("limit_reports")), + ) + config = { + "ocr_root": payload.get("ocr_root") or defaults["ocr_root"], + "raw_root": payload.get("raw_root") or defaults["raw_root"], + "queue_mode": queue_mode, + "sample_size": payload.get("sample_size", defaults.get("sample_size")), + "seed": int(payload.get("seed", defaults["seed"])), + "limit": payload.get("limit", defaults.get("limit")), + "limit_reports": payload.get( + "limit_reports", defaults.get("limit_reports") + ), + } + metadata = create_session( + session_name=str(payload.get("session_name") or "OCR annotation session"), + annotator=str(payload.get("annotator") or "anonymous"), + manifest_items=[item.to_manifest_record() for item in queue], + index_summary=index_summary, + config=config, + ) + return jsonify( + {"metadata": metadata, "progress": progress_payload(metadata["session_id"])} + ) + + @app.get("/api/session/") + def api_session(session_id: str) -> Any: + return jsonify(progress_payload(session_id)) + + @app.get("/api/session//item/") + def api_item(session_id: str, index: int) -> Any: + item = get_item_or_404(session_id, index) + text = item_page_text(item) + annotations = load_current_annotations(session_id) + return jsonify( + { + "index": index, + "item_count": len(load_manifest(session_id)), + "item": item, + "annotation": annotations.get(item["item_id"]), + "page_text": text, + "markdown_html": render_markdown_page(text, session_id, index), + "image_url": f"/api/session/{session_id}/item/{index}/raw-image", + } + ) + + @app.get("/api/session//item//raw-image") + def api_raw_image(session_id: str, index: int) -> 
Any: + item = get_item_or_404(session_id, index) + raw_png_path = item.get("raw_png_path") + if not raw_png_path: + abort(404, description="raw page image missing") + target = Path(raw_png_path).resolve() + raw_root = Path(item.get("raw_root") or "/").resolve() + if not target.is_relative_to(raw_root): + abort(400, description="raw image outside raw root") + if not target.is_file(): + abort(404, description="raw page image missing") + return send_file(target) + + @app.get("/api/session//item//inline-image/") + def api_inline_image(session_id: str, index: int, rel_path: str) -> Any: + item = get_item_or_404(session_id, index) + report_dir = Path(item["report_dir"]) + target = safe_child_path(report_dir, rel_path) + if not target.is_file(): + abort(404, description="inline OCR image missing") + return send_file(target) + + @app.post("/api/session//annotation") + def api_save_annotation(session_id: str) -> Any: + payload = request.get_json(force=True, silent=False) or {} + item_id = payload.get("item_id") + if not item_id: + abort(400, description="missing item_id") + record = save_annotation( + session_id=session_id, item_id=str(item_id), payload=payload + ) + return jsonify({"annotation": record, "progress": progress_payload(session_id)}) + + @app.get("/api/session//progress") + def api_progress(session_id: str) -> Any: + return jsonify(progress_payload(session_id)) + + @app.post("/api/session//summarize") + def api_summarize(session_id: str) -> Any: + paths = write_summary_files(session_id) + return jsonify({"paths": paths, "progress": progress_payload(session_id)}) + + @app.get("/api/session//summary.csv") + def api_summary_csv(session_id: str) -> Any: + write_summary_files(session_id) + return send_file(session_dir(session_id) / "summary.csv", as_attachment=True) + + @app.get("/api/session//summary.md") + def api_summary_md(session_id: str) -> Any: + write_summary_files(session_id) + return send_file(session_dir(session_id) / "summary.md", as_attachment=True) + + 
return app + + +def main(argv: list[str] | None = None) -> int: + args = build_arg_parser().parse_args(argv) + session_id = prepare_session(args) + build_defaults = { + "ocr_root": str(args.ocr_root), + "raw_root": str(args.raw_root), + "queue_mode": args.queue_mode, + "sample_size": args.sample_size, + "seed": args.seed, + "limit": args.limit, + "limit_reports": args.limit_reports, + } + app = create_app(session_id, build_defaults) + print(f"Annotation session: {session_id}") + print(f"Open: http://{args.host}:{args.port}") + app.run(host=args.host, port=args.port, debug=args.debug) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/annotation_OCR/static/app.js b/annotation_OCR/static/app.js new file mode 100644 index 0000000..fd49ccc --- /dev/null +++ b/annotation_OCR/static/app.js @@ -0,0 +1,352 @@ +const state = { + sessionId: window.OCR_ANNOTATION_DEFAULT_SESSION_ID, + index: 0, + itemCount: 0, + item: null, + overallStatus: 'unreviewed', + startedAt: null, + zoom: 1, + showingRaw: false, + saving: false, +}; + +const els = { + sessionTitle: document.getElementById('sessionTitle'), + sessionMeta: document.getElementById('sessionMeta'), + progressText: document.getElementById('progressText'), + progressBar: document.getElementById('progressBar'), + prevButton: document.getElementById('prevButton'), + nextButton: document.getElementById('nextButton'), + skipReviewedButton: document.getElementById('skipReviewedButton'), + helpButton: document.getElementById('helpButton'), + rawImage: document.getElementById('rawImage'), + imageMissing: document.getElementById('imageMissing'), + imageSubtitle: document.getElementById('imageSubtitle'), + markdownSubtitle: document.getElementById('markdownSubtitle'), + markdownPreview: document.getElementById('markdownPreview'), + rawMarkdown: document.getElementById('rawMarkdown'), + toggleRawButton: document.getElementById('toggleRawButton'), + zoomOutButton: 
document.getElementById('zoomOutButton'), + zoomResetButton: document.getElementById('zoomResetButton'), + zoomInButton: document.getElementById('zoomInButton'), + reportName: document.getElementById('reportName'), + industryValue: document.getElementById('industryValue'), + tickerValue: document.getElementById('tickerValue'), + pageValue: document.getElementById('pageValue'), + signalsValue: document.getElementById('signalsValue'), + mappingValue: document.getElementById('mappingValue'), + notesInput: document.getElementById('notesInput'), + issueGrid: document.getElementById('issueGrid'), + saveButton: document.getElementById('saveButton'), + saveStatus: document.getElementById('saveStatus'), + summaryCsvLink: document.getElementById('summaryCsvLink'), + summaryMdLink: document.getElementById('summaryMdLink'), + helpDialog: document.getElementById('helpDialog'), +}; + +function apiJson(url, options = {}) { + return fetch(url, { + headers: { 'Content-Type': 'application/json' }, + ...options, + }).then(async (response) => { + if (!response.ok) { + const text = await response.text(); + throw new Error(text || `${response.status} ${response.statusText}`); + } + return response.json(); + }); +} + +function statusMessage(message, tone = 'neutral') { + els.saveStatus.textContent = message; + els.saveStatus.dataset.tone = tone; +} + +function formatList(values) { + if (!values || values.length === 0) return 'none'; + return values.join(', '); +} + +function updateProgress(progress) { + const metadata = progress.metadata || {}; + state.itemCount = progress.item_count || 0; + els.sessionTitle.textContent = metadata.session_name || metadata.session_id || 'Session'; + els.sessionMeta.textContent = `${metadata.annotator || 'anonymous'} ยท ${metadata.session_id || state.sessionId}`; + const reviewed = progress.reviewed_count || 0; + const total = progress.item_count || 0; + els.progressText.textContent = `${reviewed} / ${total} reviewed`; + const pct = total ? 
Math.round((reviewed / total) * 100) : 0; + els.progressBar.style.width = `${pct}%`; + els.summaryCsvLink.href = `/api/session/${state.sessionId}/summary.csv`; + els.summaryMdLink.href = `/api/session/${state.sessionId}/summary.md`; +} + +function setOverall(status) { + state.overallStatus = status; + document.querySelectorAll('.status-button').forEach((button) => { + button.classList.toggle('active', button.dataset.status === status); + }); +} + +function setSubchecks(values = {}) { + document.querySelectorAll('[data-subcheck]').forEach((select) => { + select.value = values[select.dataset.subcheck] || 'unreviewed'; + }); +} + +function setIssues(values = []) { + const selected = new Set(values); + els.issueGrid.querySelectorAll('input[type="checkbox"]').forEach((checkbox) => { + checkbox.checked = selected.has(checkbox.value); + }); +} + +function getSubchecks() { + const subchecks = {}; + document.querySelectorAll('[data-subcheck]').forEach((select) => { + subchecks[select.dataset.subcheck] = select.value; + }); + return subchecks; +} + +function getIssues() { + return Array.from(els.issueGrid.querySelectorAll('input[type="checkbox"]:checked')) + .map((checkbox) => checkbox.value) + .sort(); +} + +function loadAnnotation(annotation) { + setOverall(annotation?.overall_status || 'unreviewed'); + setSubchecks(annotation?.subchecks || {}); + setIssues(annotation?.issue_tags || []); + els.notesInput.value = annotation?.notes || ''; +} + +function applyZoom() { + els.rawImage.style.transform = `scale(${state.zoom})`; + els.rawImage.style.marginBottom = `${Math.max(0, (state.zoom - 1) * 100)}%`; + els.zoomResetButton.textContent = `${Math.round(state.zoom * 100)}%`; +} + +function setZoom(value) { + state.zoom = Math.min(3, Math.max(0.35, value)); + applyZoom(); +} + +async function loadProgress() { + const progress = await apiJson(`/api/session/${state.sessionId}`); + updateProgress(progress); + return progress; +} + +async function loadItem(index) { + const safeIndex 
= Math.max(0, Math.min(index, Math.max(0, state.itemCount - 1))); + const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}`); + state.index = safeIndex; + state.item = data.item; + state.itemCount = data.item_count; + state.startedAt = new Date(); + + els.reportName.textContent = data.item.report_name; + els.industryValue.textContent = data.item.industry_slug; + els.tickerValue.textContent = `${data.item.exchange}:${data.item.ticker} ยท ${data.item.year}`; + els.pageValue.textContent = `${data.item.page_number} / ${data.item.mmd_page_count}`; + els.signalsValue.textContent = formatList(data.item.candidate_reasons); + els.mappingValue.textContent = [data.item.mapping_status, ...data.item.mapping_warnings].filter(Boolean).join(' ยท '); + els.imageSubtitle.textContent = data.item.raw_png_path || 'No raw image path'; + els.markdownSubtitle.textContent = `${data.item.page_text_chars} chars ยท ${data.item.page_text_sha256.slice(0, 12)}`; + + els.markdownPreview.innerHTML = data.markdown_html || ''; + els.rawMarkdown.textContent = data.page_text || ''; + + if (data.item.raw_png_path) { + els.rawImage.hidden = false; + els.imageMissing.hidden = true; + els.rawImage.src = `${data.image_url}?v=${encodeURIComponent(data.item.page_text_sha256)}`; + } else { + els.rawImage.hidden = true; + els.imageMissing.hidden = false; + els.rawImage.removeAttribute('src'); + } + + loadAnnotation(data.annotation); + setZoom(1); + statusMessage(`Loaded item ${safeIndex + 1} of ${data.item_count}`); + els.prevButton.disabled = safeIndex === 0; + els.nextButton.disabled = safeIndex >= data.item_count - 1; +} + +function annotationPayload(source = 'manual') { + return { + item_id: state.item.item_id, + overall_status: state.overallStatus, + subchecks: getSubchecks(), + issue_tags: getIssues(), + notes: els.notesInput.value, + annotation_source: source, + review_duration_ms: state.startedAt ? new Date() - state.startedAt : null, + client_started_at_utc: state.startedAt ? 
state.startedAt.toISOString() : null, + client_updated_at_utc: new Date().toISOString(), + }; +} + +async function saveAnnotation(source = 'manual', advance = false) { + if (!state.item || state.saving) return; + state.saving = true; + els.saveButton.disabled = true; + statusMessage('Saving...'); + try { + const data = await apiJson(`/api/session/${state.sessionId}/annotation`, { + method: 'POST', + body: JSON.stringify(annotationPayload(source)), + }); + updateProgress(data.progress); + statusMessage('Saved', 'ok'); + if (advance && state.index < state.itemCount - 1) { + await loadItem(state.index + 1); + await loadProgress(); + } + } catch (error) { + statusMessage(`Save failed: ${error.message}`, 'error'); + } finally { + state.saving = false; + els.saveButton.disabled = false; + } +} + +function quickMark(status) { + setOverall(status); + if (status === 'ok') { + setSubchecks({ + text_content: 'ok', + table_content: 'ok', + table_structure: 'ok', + page_alignment: 'ok', + }); + setIssues([]); + } else if (status === 'not_ok') { + const subchecks = getSubchecks(); + if (Object.values(subchecks).every((value) => value === 'unreviewed')) { + setSubchecks({ + text_content: 'uncertain', + table_content: 'uncertain', + table_structure: 'not_ok', + page_alignment: 'uncertain', + }); + } + } else if (status === 'uncertain') { + setSubchecks({ + text_content: 'uncertain', + table_content: 'uncertain', + table_structure: 'uncertain', + page_alignment: 'uncertain', + }); + } + saveAnnotation(`shortcut:${status}`, true); +} + +function toggleIssue(tag) { + const checkbox = els.issueGrid.querySelector(`input[value="${tag}"]`); + if (checkbox) checkbox.checked = !checkbox.checked; +} + +async function go(delta) { + const target = state.index + delta; + if (target < 0 || target >= state.itemCount) return; + await loadItem(target); + await loadProgress(); +} + +async function goNextOpen() { + const progress = await loadProgress(); + if (progress.next_unreviewed_index === null 
|| progress.next_unreviewed_index === undefined) { + statusMessage('No open items'); + return; + } + await loadItem(progress.next_unreviewed_index); +} + +function toggleRawMarkdown() { + state.showingRaw = !state.showingRaw; + els.rawMarkdown.hidden = !state.showingRaw; + els.markdownPreview.hidden = state.showingRaw; + els.toggleRawButton.textContent = state.showingRaw ? 'Rendered' : 'Raw Markdown'; +} + +function inputHasFocus() { + const active = document.activeElement; + return active && ['TEXTAREA', 'INPUT', 'SELECT'].includes(active.tagName); +} + +function setupEvents() { + els.prevButton.addEventListener('click', () => go(-1)); + els.nextButton.addEventListener('click', () => go(1)); + els.skipReviewedButton.addEventListener('click', goNextOpen); + els.saveButton.addEventListener('click', () => saveAnnotation('manual', false)); + els.toggleRawButton.addEventListener('click', toggleRawMarkdown); + els.zoomOutButton.addEventListener('click', () => setZoom(state.zoom - 0.15)); + els.zoomInButton.addEventListener('click', () => setZoom(state.zoom + 0.15)); + els.zoomResetButton.addEventListener('click', () => setZoom(1)); + els.helpButton.addEventListener('click', () => els.helpDialog.showModal()); + document.querySelectorAll('.status-button').forEach((button) => { + button.addEventListener('click', () => setOverall(button.dataset.status)); + }); + + document.addEventListener('keydown', (event) => { + if (inputHasFocus()) return; + if (event.key === '?') { + event.preventDefault(); + els.helpDialog.showModal(); + } else if (event.key.toLowerCase() === 'a') { + event.preventDefault(); + quickMark('ok'); + } else if (event.key.toLowerCase() === 'r') { + event.preventDefault(); + quickMark('not_ok'); + } else if (event.key.toLowerCase() === 'u') { + event.preventDefault(); + quickMark('uncertain'); + } else if (event.key === 'ArrowRight' || event.key.toLowerCase() === 'j') { + event.preventDefault(); + go(1); + } else if (event.key === 'ArrowLeft' || 
event.key.toLowerCase() === 'k') { + event.preventDefault(); + go(-1); + } else if (event.key.toLowerCase() === 't') { + event.preventDefault(); + toggleIssue('broken_table'); + } else if (event.key.toLowerCase() === 'c') { + event.preventDefault(); + toggleIssue('merged_columns'); + } else if (event.key.toLowerCase() === 'm') { + event.preventDefault(); + toggleIssue('missing_text'); + } else if (event.key === '+' || event.key === '=') { + event.preventDefault(); + setZoom(state.zoom + 0.15); + } else if (event.key === '-') { + event.preventDefault(); + setZoom(state.zoom - 0.15); + } else if (event.key === '0') { + event.preventDefault(); + setZoom(1); + } + }); +} + +async function init() { + setupEvents(); + try { + const progress = await loadProgress(); + const startIndex = progress.next_unreviewed_index ?? 0; + if (progress.item_count > 0) { + await loadItem(startIndex); + } else { + statusMessage('Session has no queued items', 'error'); + } + } catch (error) { + statusMessage(`Startup failed: ${error.message}`, 'error'); + } +} + +init(); \ No newline at end of file diff --git a/annotation_OCR/static/style.css b/annotation_OCR/static/style.css new file mode 100644 index 0000000..deaa160 --- /dev/null +++ b/annotation_OCR/static/style.css @@ -0,0 +1,448 @@ +:root { + --bg: #edf1f2; + --panel: #fbfcfa; + --panel-2: #f5f7f4; + --ink: #1d2528; + --muted: #5b686d; + --line: #cdd7d8; + --teal: #08746f; + --teal-dark: #075854; + --red: #aa3d2d; + --amber: #a06010; + --green: #2d7434; + --shadow: 0 18px 45px rgba(31, 45, 49, 0.14); + --mono: "JetBrains Mono", "IBM Plex Mono", "Cascadia Mono", monospace; + --sans: "Aptos", "Source Sans 3", "Segoe UI", sans-serif; +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + min-height: 100vh; + background: + linear-gradient(135deg, rgba(8, 116, 111, 0.09), transparent 34%), + linear-gradient(315deg, rgba(170, 61, 45, 0.08), transparent 36%), + var(--bg); + color: var(--ink); + font-family: var(--sans); +} + 
+button, +select, +textarea { + font: inherit; +} + +button, +.secondary-link { + border: 1px solid var(--line); + background: var(--panel); + color: var(--ink); + min-height: 36px; + padding: 0 12px; + border-radius: 6px; + cursor: pointer; + text-decoration: none; + display: inline-flex; + align-items: center; + justify-content: center; + white-space: nowrap; +} + +button:hover, +.secondary-link:hover { + border-color: var(--teal); +} + +.topbar { + position: sticky; + top: 0; + z-index: 20; + display: grid; + grid-template-columns: minmax(280px, 1fr) minmax(260px, 420px) auto; + gap: 18px; + align-items: center; + padding: 14px 18px; + background: rgba(251, 252, 250, 0.94); + border-bottom: 1px solid var(--line); + backdrop-filter: blur(14px); +} + +.eyebrow, +.section-label { + color: var(--muted); + font-size: 11px; + font-weight: 700; + letter-spacing: 0; + text-transform: uppercase; +} + +.session-title { + font-size: 18px; + font-weight: 800; +} + +.session-meta, +.pane-subtitle, +.save-status { + color: var(--muted); + font-size: 12px; +} + +.progress-block { + display: grid; + gap: 7px; +} + +.progress-track { + width: 100%; + height: 8px; + overflow: hidden; + background: #dce3e4; + border-radius: 999px; +} + +.progress-track div { + width: 0%; + height: 100%; + background: linear-gradient(90deg, var(--teal), #6a8d28); + transition: width 160ms ease; +} + +.nav-actions, +.zoom-actions, +.panel-actions { + display: flex; + gap: 8px; + align-items: center; +} + +.icon-button { + width: 36px; + padding: 0; + font-weight: 800; +} + +.workspace { + display: grid; + grid-template-columns: minmax(340px, 1.05fr) minmax(340px, 1fr) 340px; + gap: 14px; + padding: 14px; + height: calc(100vh - 82px); +} + +.pane, +.annotation-panel { + min-height: 0; + background: var(--panel); + border: 1px solid var(--line); + border-radius: 8px; + box-shadow: var(--shadow); +} + +.pane { + display: grid; + grid-template-rows: auto minmax(0, 1fr); + overflow: hidden; +} + 
+.pane-toolbar { + display: flex; + justify-content: space-between; + gap: 12px; + align-items: center; + padding: 12px; + border-bottom: 1px solid var(--line); + background: var(--panel-2); +} + +.pane-title { + font-size: 15px; + font-weight: 800; +} + +.image-stage { + position: relative; + overflow: auto; + display: grid; + place-items: start center; + padding: 16px; + background: + linear-gradient(45deg, #dce3e4 25%, transparent 25%), + linear-gradient(-45deg, #dce3e4 25%, transparent 25%), + linear-gradient(45deg, transparent 75%, #dce3e4 75%), + linear-gradient(-45deg, transparent 75%, #dce3e4 75%); + background-size: 22px 22px; + background-position: 0 0, 0 11px, 11px -11px, -11px 0; +} + +#rawImage { + display: block; + max-width: none; + width: min(100%, 900px); + transform-origin: top center; + border: 1px solid #b7c3c5; + background: white; + box-shadow: 0 12px 28px rgba(31, 45, 49, 0.18); +} + +.missing-state { + margin: 40px auto; + padding: 20px; + border: 1px dashed var(--red); + background: #fff7f3; + color: var(--red); + border-radius: 8px; +} + +.markdown-preview, +.raw-markdown { + overflow: auto; + margin: 0; + padding: 18px; +} + +.markdown-preview { + line-height: 1.48; +} + +.markdown-preview h1, +.markdown-preview h2, +.markdown-preview h3 { + margin: 1.2em 0 0.45em; + line-height: 1.15; +} + +.markdown-preview table { + width: max-content; + max-width: 100%; + border-collapse: collapse; + margin: 14px 0; + font-size: 13px; +} + +.markdown-preview th, +.markdown-preview td { + border: 1px solid #b9c4c6; + padding: 6px 8px; + vertical-align: top; +} + +.markdown-preview th { + background: #e3eceb; +} + +.markdown-preview img { + max-width: 100%; + height: auto; + border: 1px solid var(--line); +} + +.raw-markdown { + font-family: var(--mono); + font-size: 12px; + line-height: 1.45; + white-space: pre-wrap; + background: #172225; + color: #e7eeed; +} + +.annotation-panel { + display: flex; + flex-direction: column; + overflow: auto; + 
padding: 12px; + gap: 12px; +} + +.panel-section { + display: grid; + gap: 10px; + padding-bottom: 12px; + border-bottom: 1px solid var(--line); +} + +.report-card h1 { + margin: 0; + font-size: 18px; + line-height: 1.2; +} + +dl { + display: grid; + gap: 7px; + margin: 0; +} + +dl div { + display: grid; + grid-template-columns: 78px minmax(0, 1fr); + gap: 8px; +} + +dt { + color: var(--muted); + font-size: 12px; +} + +dd { + margin: 0; + min-width: 0; + overflow-wrap: anywhere; + font-size: 12px; +} + +.decision-buttons { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 8px; +} + +.status-button[data-status="ok"].active { + background: var(--green); + border-color: var(--green); + color: white; +} + +.status-button[data-status="not_ok"].active { + background: var(--red); + border-color: var(--red); + color: white; +} + +.status-button[data-status="uncertain"].active { + background: var(--amber); + border-color: var(--amber); + color: white; +} + +.subchecks-section label { + display: grid; + grid-template-columns: 1fr 140px; + gap: 8px; + align-items: center; + font-size: 13px; +} + +select, +textarea { + width: 100%; + border: 1px solid var(--line); + border-radius: 6px; + background: white; + color: var(--ink); +} + +select { + min-height: 34px; +} + +textarea { + resize: vertical; + padding: 8px; +} + +.issue-grid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 8px; +} + +.issue-grid label { + display: flex; + align-items: center; + gap: 6px; + font-size: 12px; +} + +.primary-button { + background: var(--teal); + border-color: var(--teal); + color: white; + font-weight: 800; + flex: 1; +} + +.primary-button:hover { + background: var(--teal-dark); + border-color: var(--teal-dark); +} + +.save-status { + min-height: 20px; +} + +.help-dialog { + width: min(520px, calc(100vw - 32px)); + border: 1px solid var(--line); + border-radius: 8px; + box-shadow: var(--shadow); +} + +.dialog-header { + display: flex; + align-items: center; + 
justify-content: space-between; + gap: 12px; +} + +.dialog-header h2 { + margin: 0 0 12px; +} + +.shortcut-grid { + display: grid; + grid-template-columns: 90px minmax(0, 1fr); + gap: 8px 14px; +} + +.shortcut-grid span { + font-family: var(--mono); + font-weight: 800; +} + +.shortcut-grid p { + margin: 0; +} + +@media (max-width: 1180px) { + .topbar { + grid-template-columns: 1fr; + } + + .workspace { + height: auto; + min-height: calc(100vh - 82px); + grid-template-columns: 1fr; + } + + .pane { + min-height: 72vh; + } + + .annotation-panel { + min-height: 0; + } +} + +@media (max-width: 620px) { + + .nav-actions, + .pane-toolbar, + .panel-actions { + flex-wrap: wrap; + } + + .decision-buttons, + .issue-grid { + grid-template-columns: 1fr; + } + + .subchecks-section label, + dl div { + grid-template-columns: 1fr; + } +} \ No newline at end of file diff --git a/annotation_OCR/store.py b/annotation_OCR/store.py new file mode 100644 index 0000000..e2cbe3a --- /dev/null +++ b/annotation_OCR/store.py @@ -0,0 +1,400 @@ +"""File-backed session storage for OCR annotation runs.""" + +from __future__ import annotations + +import csv +import json +import re +import uuid +from collections import Counter +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +HERE = Path(__file__).resolve().parent +SESSIONS_DIR = HERE / "sessions" +SCHEMA_VERSION = "1.0" + +VALID_OVERALL_STATUS = {"ok", "not_ok", "uncertain", "unreviewed"} +VALID_SUBCHECK_STATUS = {"ok", "not_ok", "uncertain", "not_applicable", "unreviewed"} + +SUMMARY_FIELDS = [ + "session_id", + "session_name", + "annotator", + "item_id", + "industry_slug", + "report_name", + "exchange", + "ticker", + "year", + "page_index", + "page_number", + "overall_status", + "text_content", + "table_content", + "table_structure", + "page_alignment", + "issue_tags", + "notes", + "updated_at_utc", + "annotation_source", + "review_duration_ms", + "mapping_status", + "mapping_warnings", + 
"candidate_reasons", + "page_text_sha256", + "raw_png_path", + "mmd_path", +] + + +def utc_now() -> str: + return datetime.now(timezone.utc).isoformat(timespec="seconds") + + +def session_slug(value: str) -> str: + slug = re.sub(r"[^A-Za-z0-9_.-]+", "-", value.strip()).strip("-._") + return slug[:48] or "session" + + +def new_session_id(session_name: str | None = None) -> str: + prefix = session_slug(session_name or "session")[:24] + return f"{prefix}-{uuid.uuid4().hex[:12]}" + + +def atomic_write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(text, encoding="utf-8") + tmp.replace(path) + + +def atomic_write_json(path: Path, payload: Any) -> None: + atomic_write_text(path, json.dumps(payload, indent=2, ensure_ascii=False)) + + +def session_dir(session_id: str) -> Path: + return SESSIONS_DIR / session_id + + +def metadata_path(session_id: str) -> Path: + return session_dir(session_id) / "metadata.json" + + +def manifest_path(session_id: str) -> Path: + return session_dir(session_id) / "manifest.json" + + +def current_annotations_path(session_id: str) -> Path: + return session_dir(session_id) / "current_annotations.json" + + +def annotations_log_path(session_id: str) -> Path: + return session_dir(session_id) / "annotations.jsonl" + + +def create_session( + *, + session_name: str, + annotator: str, + manifest_items: list[dict[str, Any]], + index_summary: dict[str, Any], + config: dict[str, Any], + session_id: str | None = None, +) -> dict[str, Any]: + sid = session_id or new_session_id(session_name) + directory = session_dir(sid) + if directory.exists(): + raise FileExistsError(f"session already exists: {sid}") + directory.mkdir(parents=True, exist_ok=False) + + now = utc_now() + metadata = { + "schema_version": SCHEMA_VERSION, + "session_id": sid, + "session_name": session_name, + "annotator": annotator, + "created_at_utc": now, + "updated_at_utc": now, + "status": 
"active", + "item_count": len(manifest_items), + "completed_count": 0, + "index_summary": index_summary, + "config": config, + } + manifest = { + "schema_version": SCHEMA_VERSION, + "session_id": sid, + "created_at_utc": now, + "item_count": len(manifest_items), + "items": manifest_items, + } + + atomic_write_json(metadata_path(sid), metadata) + atomic_write_json(manifest_path(sid), manifest) + atomic_write_json(current_annotations_path(sid), {}) + annotations_log_path(sid).touch() + write_summary_files(sid) + return metadata + + +def load_json(path: Path, default: Any | None = None) -> Any: + if not path.is_file(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def load_metadata(session_id: str) -> dict[str, Any]: + metadata = load_json(metadata_path(session_id)) + if metadata is None: + raise FileNotFoundError(f"unknown session: {session_id}") + return metadata + + +def load_manifest(session_id: str) -> list[dict[str, Any]]: + manifest = load_json(manifest_path(session_id)) + if manifest is None: + raise FileNotFoundError(f"unknown session manifest: {session_id}") + return manifest.get("items", []) + + +def load_current_annotations(session_id: str) -> dict[str, dict[str, Any]]: + return load_json(current_annotations_path(session_id), default={}) or {} + + +def list_sessions() -> list[dict[str, Any]]: + if not SESSIONS_DIR.is_dir(): + return [] + sessions: list[dict[str, Any]] = [] + for path in sorted(SESSIONS_DIR.iterdir()): + if not path.is_dir(): + continue + metadata = load_json(path / "metadata.json") + if isinstance(metadata, dict): + sessions.append(metadata) + sessions.sort(key=lambda rec: rec.get("updated_at_utc", ""), reverse=True) + return sessions + + +def manifest_index(session_id: str) -> dict[str, dict[str, Any]]: + return {item["item_id"]: item for item in load_manifest(session_id)} + + +def sanitize_status(value: Any, valid: set[str], default: str) -> str: + if isinstance(value, str) and value in valid: + return value 
+ return default + + +def normalize_annotation_payload(payload: dict[str, Any]) -> dict[str, Any]: + subchecks = ( + payload.get("subchecks") if isinstance(payload.get("subchecks"), dict) else {} + ) + normalized_subchecks = { + "text_content": sanitize_status( + subchecks.get("text_content"), VALID_SUBCHECK_STATUS, "unreviewed" + ), + "table_content": sanitize_status( + subchecks.get("table_content"), VALID_SUBCHECK_STATUS, "unreviewed" + ), + "table_structure": sanitize_status( + subchecks.get("table_structure"), VALID_SUBCHECK_STATUS, "unreviewed" + ), + "page_alignment": sanitize_status( + subchecks.get("page_alignment"), VALID_SUBCHECK_STATUS, "unreviewed" + ), + } + + issue_tags = payload.get("issue_tags") + if not isinstance(issue_tags, list): + issue_tags = [] + issue_tags = sorted({str(tag).strip() for tag in issue_tags if str(tag).strip()}) + + return { + "overall_status": sanitize_status( + payload.get("overall_status"), VALID_OVERALL_STATUS, "unreviewed" + ), + "subchecks": normalized_subchecks, + "issue_tags": issue_tags, + "notes": str(payload.get("notes") or "").strip(), + "annotation_source": str(payload.get("annotation_source") or "manual"), + "review_duration_ms": payload.get("review_duration_ms"), + "client_started_at_utc": payload.get("client_started_at_utc"), + "client_updated_at_utc": payload.get("client_updated_at_utc"), + } + + +def next_log_sequence(path: Path) -> int: + if not path.is_file(): + return 1 + with path.open(encoding="utf-8") as handle: + return sum(1 for line in handle if line.strip()) + 1 + + +def save_annotation( + *, + session_id: str, + item_id: str, + payload: dict[str, Any], +) -> dict[str, Any]: + metadata = load_metadata(session_id) + items = manifest_index(session_id) + item = items.get(item_id) + if item is None: + raise KeyError(f"item not in session manifest: {item_id}") + + normalized = normalize_annotation_payload(payload) + now = utc_now() + log_path = annotations_log_path(session_id) + record = { + 
"schema_version": SCHEMA_VERSION, + "sequence": next_log_sequence(log_path), + "session_id": session_id, + "session_name": metadata.get("session_name"), + "annotator": metadata.get("annotator"), + "created_at_utc": now, + "updated_at_utc": now, + "item_id": item_id, + "industry_slug": item.get("industry_slug"), + "report_name": item.get("report_name"), + "exchange": item.get("exchange"), + "ticker": item.get("ticker"), + "year": item.get("year"), + "page_index": item.get("page_index"), + "page_number": item.get("page_number"), + "mmd_path": item.get("mmd_path"), + "raw_png_path": item.get("raw_png_path"), + "mapping_status": item.get("mapping_status"), + "mapping_warnings": item.get("mapping_warnings", []), + "candidate_reasons": item.get("candidate_reasons", []), + "page_text_sha256": item.get("page_text_sha256"), + **normalized, + } + + with log_path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(record, ensure_ascii=False) + "\n") + + current = load_current_annotations(session_id) + current[item_id] = record + atomic_write_json(current_annotations_path(session_id), current) + + completed_count = sum( + 1 for rec in current.values() if rec.get("overall_status") != "unreviewed" + ) + metadata["updated_at_utc"] = now + metadata["completed_count"] = completed_count + metadata["item_count"] = len(items) + atomic_write_json(metadata_path(session_id), metadata) + write_summary_files(session_id) + return record + + +def summary_rows(session_id: str) -> list[dict[str, Any]]: + metadata = load_metadata(session_id) + current = load_current_annotations(session_id) + rows: list[dict[str, Any]] = [] + for item in load_manifest(session_id): + annotation = current.get(item["item_id"], {}) + subchecks = annotation.get("subchecks", {}) if annotation else {} + rows.append( + { + "session_id": session_id, + "session_name": metadata.get("session_name", ""), + "annotator": metadata.get("annotator", ""), + "item_id": item.get("item_id"), + "industry_slug": 
item.get("industry_slug"), + "report_name": item.get("report_name"), + "exchange": item.get("exchange"), + "ticker": item.get("ticker"), + "year": item.get("year"), + "page_index": item.get("page_index"), + "page_number": item.get("page_number"), + "overall_status": annotation.get("overall_status", "unreviewed"), + "text_content": subchecks.get("text_content", "unreviewed"), + "table_content": subchecks.get("table_content", "unreviewed"), + "table_structure": subchecks.get("table_structure", "unreviewed"), + "page_alignment": subchecks.get("page_alignment", "unreviewed"), + "issue_tags": ";".join(annotation.get("issue_tags", [])), + "notes": annotation.get("notes", ""), + "updated_at_utc": annotation.get("updated_at_utc", ""), + "annotation_source": annotation.get("annotation_source", ""), + "review_duration_ms": annotation.get("review_duration_ms", ""), + "mapping_status": item.get("mapping_status"), + "mapping_warnings": ";".join(item.get("mapping_warnings", [])), + "candidate_reasons": ";".join(item.get("candidate_reasons", [])), + "page_text_sha256": item.get("page_text_sha256"), + "raw_png_path": item.get("raw_png_path"), + "mmd_path": item.get("mmd_path"), + } + ) + return rows + + +def write_summary_csv(path: Path, rows: list[dict[str, Any]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + with tmp.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter( + handle, fieldnames=SUMMARY_FIELDS, extrasaction="ignore" + ) + writer.writeheader() + writer.writerows(rows) + tmp.replace(path) + + +def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None: + metadata = load_metadata(path.parent.name) + status_counts = Counter(row["overall_status"] for row in rows) + issue_counts: Counter[str] = Counter() + for row in rows: + for tag in str(row.get("issue_tags") or "").split(";"): + if tag: + issue_counts[tag] += 1 + + reviewed = len(rows) - status_counts.get("unreviewed", 0) + 
lines = [ + f"# OCR Annotation Summary: {metadata.get('session_name', path.parent.name)}", + "", + f"- Session ID: `{path.parent.name}`", + f"- Annotator: `{metadata.get('annotator', '')}`", + f"- Items: {len(rows)}", + f"- Reviewed: {reviewed}", + f"- Updated: {metadata.get('updated_at_utc', '')}", + "", + "## Status Counts", + "", + "| Status | Count |", + "| --- | ---: |", + ] + for status, count in sorted(status_counts.items()): + lines.append(f"| {status} | {count} |") + + lines.extend(["", "## Issue Counts", "", "| Issue | Count |", "| --- | ---: |"]) + if issue_counts: + for issue, count in issue_counts.most_common(): + lines.append(f"| {issue} | {count} |") + else: + lines.append("| none | 0 |") + + atomic_write_text(path, "\n".join(lines) + "\n") + + +def write_summary_files(session_id: str) -> dict[str, str]: + rows = summary_rows(session_id) + directory = session_dir(session_id) + csv_path = directory / "summary.csv" + md_path = directory / "summary.md" + write_summary_csv(csv_path, rows) + write_summary_md(md_path, rows) + return {"summary_csv": str(csv_path), "summary_md": str(md_path)} + + +def write_all_sessions_summary(path: Path | None = None) -> Path: + out_path = path or (SESSIONS_DIR / "all_sessions_summary.csv") + rows: list[dict[str, Any]] = [] + for metadata in list_sessions(): + rows.extend(summary_rows(metadata["session_id"])) + write_summary_csv(out_path, rows) + return out_path diff --git a/annotation_OCR/summarize.py b/annotation_OCR/summarize.py new file mode 100644 index 0000000..31d565d --- /dev/null +++ b/annotation_OCR/summarize.py @@ -0,0 +1,58 @@ +"""Regenerate OCR annotation session summaries.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +from store import list_sessions, write_all_sessions_summary, write_summary_files + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Regenerate OCR annotation summaries.") + 
parser.add_argument("--session-id", action="append", default=[]) + parser.add_argument( + "--all", + action="store_true", + help="Regenerate summaries for every session under annotation_OCR/sessions.", + ) + parser.add_argument( + "--combined-output", + type=Path, + default=None, + help="Optional path for the combined all-sessions CSV.", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = build_arg_parser().parse_args(argv) + session_ids = list(args.session_id) + if args.all: + session_ids.extend(metadata["session_id"] for metadata in list_sessions()) + + seen = set() + regenerated = [] + for session_id in session_ids: + if session_id in seen: + continue + seen.add(session_id) + regenerated.append( + {"session_id": session_id, **write_summary_files(session_id)} + ) + + combined = None + if args.all or args.combined_output: + combined = str(write_all_sessions_summary(args.combined_output)) + + print( + json.dumps( + {"regenerated": regenerated, "combined_summary_csv": combined}, indent=2 + ) + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/annotation_OCR/templates/index.html b/annotation_OCR/templates/index.html new file mode 100644 index 0000000..2c21e6c --- /dev/null +++ b/annotation_OCR/templates/index.html @@ -0,0 +1,204 @@ + + + + + + + OCR Annotation + + + + + + +
+
+
OCR annotation
+
Loading session
+
+
+ +
+
0 / 0 reviewed
+ +
+ + +
+ +
+
+
+
+
Raw page
+
+
+
+ + + +
+
+
+ Raw OCR source page + +
+
+ +
+
+
+
Extracted content
+
+
+ +
+
+ +
+ + +
+ + +
+
+

Keyboard

+ +
+
+ A +

OK, save, next

+ R +

Not OK, save, next

+ U +

Uncertain, save, next

+ J / K +

Next / previous

+ T +

Broken table

+ C +

Merged columns

+ M +

Missing text

+ + / - / 0 +

Zoom

+
+
+
+ + + \ No newline at end of file From 255ff285061148963e2d4f74d4ed19467823432a Mon Sep 17 00:00:00 2001 From: Charles Moslonka Date: Fri, 22 May 2026 11:54:35 +0200 Subject: [PATCH 4/8] DOC: explain OCR annotator --- annotation_OCR/README.md | 130 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 annotation_OCR/README.md diff --git a/annotation_OCR/README.md b/annotation_OCR/README.md new file mode 100644 index 0000000..598409e --- /dev/null +++ b/annotation_OCR/README.md @@ -0,0 +1,130 @@ +# OCR Annotation Interface + +Browser interface for comparing raw OCR page images with the corresponding Markdown page extracted by DeepSeekOCR. The app stores page-level annotations under `annotation_OCR/sessions/` so quality labels can later be joined to LLM benchmark outputs. + +## Run + +From the repository root: + +```bash +uv run python annotation_OCR/server.py \ + --session-name "table QA smoke" \ + --annotator "your-name" \ + --queue-mode table-candidates \ + --host 127.0.0.1 \ + --port 5050 +``` + +For a small smoke run: + +```bash +uv run python annotation_OCR/server.py \ + --session-name smoke \ + --annotator test \ + --queue-mode table-candidates \ + --limit-reports 2 \ + --limit 20 \ + --host 127.0.0.1 \ + --port 5050 +``` + +Resume an existing session: + +```bash +uv run python annotation_OCR/server.py --session-id SESSION_ID --host 127.0.0.1 --port 5050 +``` + +SSH port forwarding from a laptop: + +```bash +ssh -L 5050:127.0.0.1:5050 USER@SERVER +``` + +Then open `http://127.0.0.1:5050` locally. + +## Data Sources + +Defaults: + +- OCR Markdown root: `DeepSeekOCR_Ardian_pruned_1k/` +- Raw image root: `/data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs/` + +Each queued item maps one `.mmd` page split to the raw PNG with the same zero-based page index, for example page index `12` maps to `pages/page_0012.png`. The manifest records mapping warnings such as missing raw images or page-count mismatches. 
+ +## Queue Modes + +- `table-candidates`: default. Keeps pages with table-like signals, dense numeric rows, financial statement headings, or KPI aliases. +- `all`: queues every page. +- `sample`: seeded random sample across all discovered pages. Use `--sample-size` and `--seed`. + +Indexer smoke check: + +```bash +uv run python annotation_OCR/ocr_index.py \ + --ocr-root DeepSeekOCR_Ardian_pruned_1k \ + --raw-root /data/workspace/charles/pdf_ocr_deepseek/DeepSeekOCR_Ardian_raw_3kdocs \ + --queue-mode table-candidates \ + --limit-reports 2 \ + --limit 20 \ + --check +``` + +## Keyboard + +- `a`: mark OK, save, advance +- `r`: mark Not OK, save, advance +- `u`: mark Uncertain, save, advance +- `j` / right arrow: next page +- `k` / left arrow: previous page +- `t`: toggle broken table +- `c`: toggle merged columns +- `m`: toggle missing text +- `+`, `-`, `0`: zoom controls +- `?`: shortcut dialog + +Shortcuts are disabled while typing in notes or editing form controls. + +## Outputs + +Each session writes to `annotation_OCR/sessions/{session_id}/`: + +- `metadata.json`: session name, annotator, configuration, counts, timestamps. +- `manifest.json`: queued pages and mapping diagnostics. +- `annotations.jsonl`: append-only event log, one saved annotation per line. +- `current_annotations.json`: latest annotation per item, written atomically. +- `summary.csv`: one row per queued page, including unreviewed pages. +- `summary.md`: status and issue-count overview. 
+ +Regenerate summaries: + +```bash +uv run python annotation_OCR/summarize.py --session-id SESSION_ID +uv run python annotation_OCR/summarize.py --all +``` + +## Annotation Schema + +Primary fields: + +- `overall_status`: `ok`, `not_ok`, `uncertain`, or `unreviewed` +- `subchecks`: `text_content`, `table_content`, `table_structure`, `page_alignment` +- `issue_tags`: `missing_text`, `extra_text`, `wrong_reading_order`, `merged_columns`, `shifted_rows`, `missing_columns`, `broken_table`, `wrong_page`, `image_missing`, `low_confidence` +- `notes`: free text + +Identity fields include `industry_slug`, `report_name`, `exchange`, `ticker`, `year`, `page_index`, `page_number`, `mmd_path`, `raw_png_path`, and `page_text_sha256`. + +## Downstream Joins + +For page-level filtering, join annotation summaries on: + +```text +exchange, ticker, year, page_index +``` + +For report-level benchmark filtering, aggregate page labels to: + +```text +exchange, ticker, year +``` + +A conservative report-level rule is to exclude a report when any reviewed table-candidate page is `not_ok`, or when the share of `uncertain` pages exceeds a threshold chosen for the benchmark run. \ No newline at end of file From 8eb03e8f0f1fd5940d36a10768d289f491933df2 Mon Sep 17 00:00:00 2001 From: Charles Moslonka Date: Fri, 22 May 2026 19:37:39 +0200 Subject: [PATCH 5/8] ENH: let the user make the session name on start. 
--- annotation_OCR/README.md | 30 ++- annotation_OCR/server.py | 98 +++++++--- annotation_OCR/static/app.js | 125 +++++------- annotation_OCR/static/style.css | 61 +++++- annotation_OCR/store.py | 49 ----- annotation_OCR/templates/index.html | 89 ++------- annotation_OCR/templates/landing.html | 263 ++++++++++++++++++++++++++ 7 files changed, 474 insertions(+), 241 deletions(-) create mode 100644 annotation_OCR/templates/landing.html diff --git a/annotation_OCR/README.md b/annotation_OCR/README.md index 598409e..11e9641 100644 --- a/annotation_OCR/README.md +++ b/annotation_OCR/README.md @@ -4,6 +4,21 @@ Browser interface for comparing raw OCR page images with the corresponding Markd ## Run +### Headless mode (recommended for multi-user) + +Start the server with no session arguments — annotators create/resume sessions +from the browser landing page: + +```bash +uv run python annotation_OCR/server.py --host 0.0.0.0 --port 5050 +``` + +Then open `http://HOST:5050`. The landing page lets each user enter their name, +create a new session, or resume an existing one. No CLI or Python knowledge +needed on the annotator side. + +### Pre-created session (single-user / scripted) + From the repository root: ```bash @@ -42,6 +57,8 @@ ssh -L 5050:127.0.0.1:5050 USER@SERVER Then open `http://127.0.0.1:5050` locally. +The extracted-content pane shows inline OCR images by default. Turn off `Inline images` if you want a lighter placeholder-only Markdown preview. 
+ ## Data Sources Defaults: @@ -71,14 +88,11 @@ uv run python annotation_OCR/ocr_index.py \ ## Keyboard -- `a`: mark OK, save, advance -- `r`: mark Not OK, save, advance +- `a`: mark Yes, save, advance +- `r`: mark No, save, advance - `u`: mark Uncertain, save, advance - `j` / right arrow: next page - `k` / left arrow: previous page -- `t`: toggle broken table -- `c`: toggle merged columns -- `m`: toggle missing text - `+`, `-`, `0`: zoom controls - `?`: shortcut dialog @@ -93,7 +107,7 @@ Each session writes to `annotation_OCR/sessions/{session_id}/`: - `annotations.jsonl`: append-only event log, one saved annotation per line. - `current_annotations.json`: latest annotation per item, written atomically. - `summary.csv`: one row per queued page, including unreviewed pages. -- `summary.md`: status and issue-count overview. +- `summary.md`: status-count overview. Regenerate summaries: @@ -107,9 +121,7 @@ uv run python annotation_OCR/summarize.py --all Primary fields: - `overall_status`: `ok`, `not_ok`, `uncertain`, or `unreviewed` -- `subchecks`: `text_content`, `table_content`, `table_structure`, `page_alignment` -- `issue_tags`: `missing_text`, `extra_text`, `wrong_reading_order`, `merged_columns`, `shifted_rows`, `missing_columns`, `broken_table`, `wrong_page`, `image_missing`, `low_confidence` -- `notes`: free text +- `notes`: optional free text Identity fields include `industry_slug`, `report_name`, `exchange`, `ticker`, `year`, `page_index`, `page_number`, `mmd_path`, `raw_png_path`, and `page_text_sha256`. 
diff --git a/annotation_OCR/server.py b/annotation_OCR/server.py index 6ea3793..16460d5 100644 --- a/annotation_OCR/server.py +++ b/annotation_OCR/server.py @@ -10,7 +10,7 @@ import bleach import markdown as markdown_lib -from flask import Flask, abort, jsonify, render_template, request, send_file +from flask import Flask, abort, jsonify, redirect, render_template, request, send_file from ocr_index import DEFAULT_OCR_ROOT, DEFAULT_RAW_ROOT, build_queue, load_pages from store import ( @@ -27,7 +27,6 @@ HERE = Path(__file__).resolve().parent IMAGE_REF_RE = re.compile(r"(!\[[^\]]*\]\()((?:\./)?images/[^)\s]+)(\))") -HTML_IMAGE_SRC_RE = re.compile(r'(]*\bsrc=["\'])(images/[^"\']+)(["\'])', re.I) ALLOWED_TAGS = set(bleach.sanitizer.ALLOWED_TAGS).union( { @@ -130,8 +129,13 @@ def cached_pages(mmd_path: str) -> tuple[str, ...]: return tuple(load_pages(Path(mmd_path))) +@lru_cache(maxsize=16) +def cached_manifest(session_id: str) -> tuple[dict[str, Any], ...]: + return tuple(load_manifest(session_id)) + + def get_item_or_404(session_id: str, index: int) -> dict[str, Any]: - manifest = load_manifest(session_id) + manifest = cached_manifest(session_id) if index < 0 or index >= len(manifest): abort(404, description="item index out of range") return manifest[index] @@ -145,6 +149,12 @@ def item_page_text(item: dict[str, Any]) -> str: return pages[page_index] +def omit_markdown_image_refs(markdown_text: str) -> str: + return IMAGE_REF_RE.sub( + lambda match: f"_[image omitted: {match.group(2)}]_", markdown_text + ) + + def rewrite_markdown_image_refs(markdown_text: str, session_id: str, index: int) -> str: def replace_md(match: re.Match[str]) -> str: rel_path = match.group(2).lstrip("./") @@ -154,23 +164,22 @@ def replace_md(match: re.Match[str]) -> str: return IMAGE_REF_RE.sub(replace_md, markdown_text) -def rewrite_html_image_refs(html: str, session_id: str, index: int) -> str: - def replace_html(match: re.Match[str]) -> str: - rel_path = match.group(2).lstrip("./") - src = 
f"/api/session/{session_id}/item/{index}/inline-image/{rel_path}" - return f"{match.group(1)}{src}{match.group(3)}" - - return HTML_IMAGE_SRC_RE.sub(replace_html, html) - - -def render_markdown_page(markdown_text: str, session_id: str, index: int) -> str: - rewritten = rewrite_markdown_image_refs(markdown_text, session_id, index) +def render_markdown_page( + markdown_text: str, + *, + session_id: str, + index: int, + show_inline_images: bool, +) -> str: + if show_inline_images: + rewritten = rewrite_markdown_image_refs(markdown_text, session_id, index) + else: + rewritten = omit_markdown_image_refs(markdown_text) html = markdown_lib.markdown( rewritten, extensions=["tables", "fenced_code", "sane_lists", "nl2br"], output_format="html5", ) - html = rewrite_html_image_refs(html, session_id, index) return bleach.clean( html, tags=ALLOWED_TAGS, @@ -192,7 +201,7 @@ def safe_child_path(root: Path, relative_path: str) -> Path: def progress_payload(session_id: str) -> dict[str, Any]: metadata = load_metadata(session_id) - manifest = load_manifest(session_id) + manifest = cached_manifest(session_id) current = load_current_annotations(session_id) status_counts: dict[str, int] = {} for item in manifest: @@ -214,19 +223,30 @@ def progress_payload(session_id: str) -> dict[str, Any]: } -def create_app(default_session_id: str, build_defaults: dict[str, Any]) -> Flask: +def create_app(default_session_id: str | None, build_defaults: dict[str, Any]) -> Flask: app = Flask(__name__, template_folder="templates", static_folder="static") app.config["DEFAULT_SESSION_ID"] = default_session_id app.config["BUILD_DEFAULTS"] = build_defaults @app.get("/") - def index() -> str: - return render_template("index.html", default_session_id=default_session_id) + def index() -> Any: + # If ?session= in URL, serve the annotation UI for that session + session_from_url = request.args.get("session") + if session_from_url: + return render_template("index.html", session_id=session_from_url) + # If server was 
started with a pre-created session, redirect to it + if default_session_id: + return redirect(f"/?session={default_session_id}") + # Otherwise show the landing / session picker page + return render_template("landing.html") @app.get("/api/sessions") def api_sessions() -> Any: return jsonify( - {"sessions": list_sessions(), "default_session_id": default_session_id} + { + "sessions": list_sessions(), + "default_session_id": default_session_id or None, + } ) @app.post("/api/sessions") @@ -261,6 +281,7 @@ def api_create_session() -> Any: index_summary=index_summary, config=config, ) + cached_manifest.cache_clear() return jsonify( {"metadata": metadata, "progress": progress_payload(metadata["session_id"])} ) @@ -271,18 +292,30 @@ def api_session(session_id: str) -> Any: @app.get("/api/session//item/") def api_item(session_id: str, index: int) -> Any: + manifest = cached_manifest(session_id) item = get_item_or_404(session_id, index) text = item_page_text(item) annotations = load_current_annotations(session_id) + show_inline_images = request.args.get("inline_images", "1") != "0" + next_image_url = None + if index + 1 < len(manifest) and manifest[index + 1].get("raw_png_path"): + next_image_url = f"/api/session/{session_id}/item/{index + 1}/raw-image" return jsonify( { "index": index, - "item_count": len(load_manifest(session_id)), + "item_count": len(manifest), "item": item, "annotation": annotations.get(item["item_id"]), "page_text": text, - "markdown_html": render_markdown_page(text, session_id, index), + "markdown_html": render_markdown_page( + text, + session_id=session_id, + index=index, + show_inline_images=show_inline_images, + ), + "inline_images": show_inline_images, "image_url": f"/api/session/{session_id}/item/{index}/raw-image", + "next_image_url": next_image_url, } ) @@ -298,7 +331,7 @@ def api_raw_image(session_id: str, index: int) -> Any: abort(400, description="raw image outside raw root") if not target.is_file(): abort(404, description="raw page image 
missing") - return send_file(target) + return send_file(target, conditional=True, max_age=86400) @app.get("/api/session//item//inline-image/") def api_inline_image(session_id: str, index: int, rel_path: str) -> Any: @@ -307,7 +340,7 @@ def api_inline_image(session_id: str, index: int, rel_path: str) -> Any: target = safe_child_path(report_dir, rel_path) if not target.is_file(): abort(404, description="inline OCR image missing") - return send_file(target) + return send_file(target, conditional=True, max_age=86400) @app.post("/api/session//annotation") def api_save_annotation(session_id: str) -> Any: @@ -344,7 +377,15 @@ def api_summary_md(session_id: str) -> Any: def main(argv: list[str] | None = None) -> int: args = build_arg_parser().parse_args(argv) - session_id = prepare_session(args) + # Session creation is now optional — if no --session-id given and + # --session-name is the default placeholder, start headless so users + # can create/resume sessions from the browser landing page. + session_id: str | None = None + if args.session_id: + session_id = prepare_session(args) + elif args.annotator != "anonymous" or args.session_name != "OCR annotation session": + session_id = prepare_session(args) + build_defaults = { "ocr_root": str(args.ocr_root), "raw_root": str(args.raw_root), @@ -355,7 +396,12 @@ def main(argv: list[str] | None = None) -> int: "limit_reports": args.limit_reports, } app = create_app(session_id, build_defaults) - print(f"Annotation session: {session_id}") + if session_id: + print(f"Annotation session: {session_id}") + else: + print( + "Starting in headless mode — users will create sessions from the browser." 
+ ) print(f"Open: http://{args.host}:{args.port}") app.run(host=args.host, port=args.port, debug=args.debug) return 0 diff --git a/annotation_OCR/static/app.js b/annotation_OCR/static/app.js index fd49ccc..880439d 100644 --- a/annotation_OCR/static/app.js +++ b/annotation_OCR/static/app.js @@ -1,5 +1,7 @@ const state = { - sessionId: window.OCR_ANNOTATION_DEFAULT_SESSION_ID, + sessionId: window.OCR_ANNOTATION_SESSION_ID + || new URLSearchParams(window.location.search).get('session') + || window.OCR_ANNOTATION_DEFAULT_SESSION_ID, index: 0, itemCount: 0, item: null, @@ -7,7 +9,9 @@ const state = { startedAt: null, zoom: 1, showingRaw: false, + showInlineImages: true, saving: false, + prefetchImage: null, }; const els = { @@ -19,12 +23,14 @@ const els = { nextButton: document.getElementById('nextButton'), skipReviewedButton: document.getElementById('skipReviewedButton'), helpButton: document.getElementById('helpButton'), + imageCanvas: document.getElementById('imageCanvas'), rawImage: document.getElementById('rawImage'), imageMissing: document.getElementById('imageMissing'), imageSubtitle: document.getElementById('imageSubtitle'), markdownSubtitle: document.getElementById('markdownSubtitle'), markdownPreview: document.getElementById('markdownPreview'), rawMarkdown: document.getElementById('rawMarkdown'), + inlineImagesToggle: document.getElementById('inlineImagesToggle'), toggleRawButton: document.getElementById('toggleRawButton'), zoomOutButton: document.getElementById('zoomOutButton'), zoomResetButton: document.getElementById('zoomResetButton'), @@ -36,7 +42,6 @@ const els = { signalsValue: document.getElementById('signalsValue'), mappingValue: document.getElementById('mappingValue'), notesInput: document.getElementById('notesInput'), - issueGrid: document.getElementById('issueGrid'), saveButton: document.getElementById('saveButton'), saveStatus: document.getElementById('saveStatus'), summaryCsvLink: document.getElementById('summaryCsvLink'), @@ -75,8 +80,7 @@ 
function updateProgress(progress) { const reviewed = progress.reviewed_count || 0; const total = progress.item_count || 0; els.progressText.textContent = `${reviewed} / ${total} reviewed`; - const pct = total ? Math.round((reviewed / total) * 100) : 0; - els.progressBar.style.width = `${pct}%`; + els.progressBar.style.width = `${total ? Math.round((reviewed / total) * 100) : 0}%`; els.summaryCsvLink.href = `/api/session/${state.sessionId}/summary.csv`; els.summaryMdLink.href = `/api/session/${state.sessionId}/summary.md`; } @@ -88,43 +92,23 @@ function setOverall(status) { }); } -function setSubchecks(values = {}) { - document.querySelectorAll('[data-subcheck]').forEach((select) => { - select.value = values[select.dataset.subcheck] || 'unreviewed'; - }); -} - -function setIssues(values = []) { - const selected = new Set(values); - els.issueGrid.querySelectorAll('input[type="checkbox"]').forEach((checkbox) => { - checkbox.checked = selected.has(checkbox.value); - }); -} - -function getSubchecks() { - const subchecks = {}; - document.querySelectorAll('[data-subcheck]').forEach((select) => { - subchecks[select.dataset.subcheck] = select.value; - }); - return subchecks; -} - -function getIssues() { - return Array.from(els.issueGrid.querySelectorAll('input[type="checkbox"]:checked')) - .map((checkbox) => checkbox.value) - .sort(); -} - function loadAnnotation(annotation) { setOverall(annotation?.overall_status || 'unreviewed'); - setSubchecks(annotation?.subchecks || {}); - setIssues(annotation?.issue_tags || []); els.notesInput.value = annotation?.notes || ''; } +function fittedImageWidth() { + const stage = els.imageCanvas.parentElement; + const availableWidth = Math.max(240, stage.clientWidth - 32); + const availableHeight = Math.max(240, stage.clientHeight - 32); + const naturalWidth = els.rawImage.naturalWidth || availableWidth; + const naturalHeight = els.rawImage.naturalHeight || naturalWidth * 1.414; + const fitScale = Math.min(availableWidth / naturalWidth, 
availableHeight / naturalHeight); + return Math.max(120, Math.floor(naturalWidth * fitScale)); +} + function applyZoom() { - els.rawImage.style.transform = `scale(${state.zoom})`; - els.rawImage.style.marginBottom = `${Math.max(0, (state.zoom - 1) * 100)}%`; + els.imageCanvas.style.setProperty('--image-width', `${Math.round(fittedImageWidth() * state.zoom)}px`); els.zoomResetButton.textContent = `${Math.round(state.zoom * 100)}%`; } @@ -139,9 +123,22 @@ async function loadProgress() { return progress; } +function prefetchNextImage(url) { + if (!url) return; + state.prefetchImage = new Image(); + state.prefetchImage.decoding = 'async'; + state.prefetchImage.src = url; +} + +function resetExtractedContentScroll() { + els.markdownPreview.scrollTop = 0; + els.rawMarkdown.scrollTop = 0; +} + async function loadItem(index) { const safeIndex = Math.max(0, Math.min(index, Math.max(0, state.itemCount - 1))); - const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}`); + const inlineFlag = state.showInlineImages ? 
'1' : '0'; + const data = await apiJson(`/api/session/${state.sessionId}/item/${safeIndex}?inline_images=${inlineFlag}`); state.index = safeIndex; state.item = data.item; state.itemCount = data.item_count; @@ -158,11 +155,13 @@ async function loadItem(index) { els.markdownPreview.innerHTML = data.markdown_html || ''; els.rawMarkdown.textContent = data.page_text || ''; + resetExtractedContentScroll(); if (data.item.raw_png_path) { els.rawImage.hidden = false; els.imageMissing.hidden = true; els.rawImage.src = `${data.image_url}?v=${encodeURIComponent(data.item.page_text_sha256)}`; + prefetchNextImage(data.next_image_url); } else { els.rawImage.hidden = true; els.imageMissing.hidden = false; @@ -180,8 +179,6 @@ function annotationPayload(source = 'manual') { return { item_id: state.item.item_id, overall_status: state.overallStatus, - subchecks: getSubchecks(), - issue_tags: getIssues(), notes: els.notesInput.value, annotation_source: source, review_duration_ms: state.startedAt ? new Date() - state.startedAt : null, @@ -214,40 +211,9 @@ async function saveAnnotation(source = 'manual', advance = false) { } } -function quickMark(status) { +function quickMark(status, source = 'shortcut') { setOverall(status); - if (status === 'ok') { - setSubchecks({ - text_content: 'ok', - table_content: 'ok', - table_structure: 'ok', - page_alignment: 'ok', - }); - setIssues([]); - } else if (status === 'not_ok') { - const subchecks = getSubchecks(); - if (Object.values(subchecks).every((value) => value === 'unreviewed')) { - setSubchecks({ - text_content: 'uncertain', - table_content: 'uncertain', - table_structure: 'not_ok', - page_alignment: 'uncertain', - }); - } - } else if (status === 'uncertain') { - setSubchecks({ - text_content: 'uncertain', - table_content: 'uncertain', - table_structure: 'uncertain', - page_alignment: 'uncertain', - }); - } - saveAnnotation(`shortcut:${status}`, true); -} - -function toggleIssue(tag) { - const checkbox = 
els.issueGrid.querySelector(`input[value="${tag}"]`); - if (checkbox) checkbox.checked = !checkbox.checked; + saveAnnotation(`${source}:${status}`, true); } async function go(delta) { @@ -283,13 +249,19 @@ function setupEvents() { els.nextButton.addEventListener('click', () => go(1)); els.skipReviewedButton.addEventListener('click', goNextOpen); els.saveButton.addEventListener('click', () => saveAnnotation('manual', false)); + els.inlineImagesToggle.addEventListener('change', () => { + state.showInlineImages = els.inlineImagesToggle.checked; + loadItem(state.index); + }); els.toggleRawButton.addEventListener('click', toggleRawMarkdown); els.zoomOutButton.addEventListener('click', () => setZoom(state.zoom - 0.15)); els.zoomInButton.addEventListener('click', () => setZoom(state.zoom + 0.15)); els.zoomResetButton.addEventListener('click', () => setZoom(1)); els.helpButton.addEventListener('click', () => els.helpDialog.showModal()); + els.rawImage.addEventListener('load', () => setZoom(1)); + window.addEventListener('resize', applyZoom); document.querySelectorAll('.status-button').forEach((button) => { - button.addEventListener('click', () => setOverall(button.dataset.status)); + button.addEventListener('click', () => quickMark(button.dataset.status, 'button')); }); document.addEventListener('keydown', (event) => { @@ -312,15 +284,6 @@ function setupEvents() { } else if (event.key === 'ArrowLeft' || event.key.toLowerCase() === 'k') { event.preventDefault(); go(-1); - } else if (event.key.toLowerCase() === 't') { - event.preventDefault(); - toggleIssue('broken_table'); - } else if (event.key.toLowerCase() === 'c') { - event.preventDefault(); - toggleIssue('merged_columns'); - } else if (event.key.toLowerCase() === 'm') { - event.preventDefault(); - toggleIssue('missing_text'); } else if (event.key === '+' || event.key === '=') { event.preventDefault(); setZoom(state.zoom + 0.15); diff --git a/annotation_OCR/static/style.css b/annotation_OCR/static/style.css index 
deaa160..a5e8305 100644 --- a/annotation_OCR/static/style.css +++ b/annotation_OCR/static/style.css @@ -92,6 +92,12 @@ button:hover, font-size: 12px; } +.pane-subtitle { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + .progress-block { display: grid; gap: 7px; @@ -118,6 +124,7 @@ button:hover, display: flex; gap: 8px; align-items: center; + flex: 0 0 auto; } .icon-button { @@ -128,7 +135,7 @@ button:hover, .workspace { display: grid; - grid-template-columns: minmax(340px, 1.05fr) minmax(340px, 1fr) 340px; + grid-template-columns: minmax(280px, 1.05fr) minmax(280px, 1fr) minmax(280px, 340px); gap: 14px; padding: 14px; height: calc(100vh - 82px); @@ -136,6 +143,7 @@ button:hover, .pane, .annotation-panel { + min-width: 0; min-height: 0; background: var(--panel); border: 1px solid var(--line); @@ -154,11 +162,47 @@ button:hover, justify-content: space-between; gap: 12px; align-items: center; + min-width: 0; + overflow: hidden; padding: 12px; border-bottom: 1px solid var(--line); background: var(--panel-2); } +.pane-toolbar>div:first-child { + flex: 1 1 auto; + min-width: 0; +} + +.zoom-actions { + margin-left: auto; +} + +.preview-actions { + display: flex; + flex: 0 0 auto; + align-items: center; + gap: 10px; +} + +.toggle-control { + display: inline-flex; + align-items: center; + gap: 6px; + color: var(--muted); + font-size: 12px; + white-space: nowrap; +} + +.zoom-actions button { + width: 36px; + padding: 0; +} + +.zoom-actions #zoomResetButton { + width: 58px; +} + .pane-title { font-size: 15px; font-weight: 800; @@ -167,8 +211,7 @@ button:hover, .image-stage { position: relative; overflow: auto; - display: grid; - place-items: start center; + display: block; padding: 16px; background: linear-gradient(45deg, #dce3e4 25%, transparent 25%), @@ -179,11 +222,19 @@ button:hover, background-position: 0 0, 0 11px, 11px -11px, -11px 0; } +.image-canvas { + --image-width: 320px; + display: flex; + justify-content: center; + align-items: 
flex-start; + min-width: max(100%, var(--image-width)); + min-height: 100%; +} + #rawImage { display: block; + width: var(--image-width); max-width: none; - width: min(100%, 900px); - transform-origin: top center; border: 1px solid #b7c3c5; background: white; box-shadow: 0 12px 28px rgba(31, 45, 49, 0.18); diff --git a/annotation_OCR/store.py b/annotation_OCR/store.py index e2cbe3a..32916b6 100644 --- a/annotation_OCR/store.py +++ b/annotation_OCR/store.py @@ -17,7 +17,6 @@ SCHEMA_VERSION = "1.0" VALID_OVERALL_STATUS = {"ok", "not_ok", "uncertain", "unreviewed"} -VALID_SUBCHECK_STATUS = {"ok", "not_ok", "uncertain", "not_applicable", "unreviewed"} SUMMARY_FIELDS = [ "session_id", @@ -32,11 +31,6 @@ "page_index", "page_number", "overall_status", - "text_content", - "table_content", - "table_structure", - "page_alignment", - "issue_tags", "notes", "updated_at_utc", "annotation_source", @@ -189,35 +183,10 @@ def sanitize_status(value: Any, valid: set[str], default: str) -> str: def normalize_annotation_payload(payload: dict[str, Any]) -> dict[str, Any]: - subchecks = ( - payload.get("subchecks") if isinstance(payload.get("subchecks"), dict) else {} - ) - normalized_subchecks = { - "text_content": sanitize_status( - subchecks.get("text_content"), VALID_SUBCHECK_STATUS, "unreviewed" - ), - "table_content": sanitize_status( - subchecks.get("table_content"), VALID_SUBCHECK_STATUS, "unreviewed" - ), - "table_structure": sanitize_status( - subchecks.get("table_structure"), VALID_SUBCHECK_STATUS, "unreviewed" - ), - "page_alignment": sanitize_status( - subchecks.get("page_alignment"), VALID_SUBCHECK_STATUS, "unreviewed" - ), - } - - issue_tags = payload.get("issue_tags") - if not isinstance(issue_tags, list): - issue_tags = [] - issue_tags = sorted({str(tag).strip() for tag in issue_tags if str(tag).strip()}) - return { "overall_status": sanitize_status( payload.get("overall_status"), VALID_OVERALL_STATUS, "unreviewed" ), - "subchecks": normalized_subchecks, - "issue_tags": 
issue_tags, "notes": str(payload.get("notes") or "").strip(), "annotation_source": str(payload.get("annotation_source") or "manual"), "review_duration_ms": payload.get("review_duration_ms"), @@ -297,7 +266,6 @@ def summary_rows(session_id: str) -> list[dict[str, Any]]: rows: list[dict[str, Any]] = [] for item in load_manifest(session_id): annotation = current.get(item["item_id"], {}) - subchecks = annotation.get("subchecks", {}) if annotation else {} rows.append( { "session_id": session_id, @@ -312,11 +280,6 @@ def summary_rows(session_id: str) -> list[dict[str, Any]]: "page_index": item.get("page_index"), "page_number": item.get("page_number"), "overall_status": annotation.get("overall_status", "unreviewed"), - "text_content": subchecks.get("text_content", "unreviewed"), - "table_content": subchecks.get("table_content", "unreviewed"), - "table_structure": subchecks.get("table_structure", "unreviewed"), - "page_alignment": subchecks.get("page_alignment", "unreviewed"), - "issue_tags": ";".join(annotation.get("issue_tags", [])), "notes": annotation.get("notes", ""), "updated_at_utc": annotation.get("updated_at_utc", ""), "annotation_source": annotation.get("annotation_source", ""), @@ -347,11 +310,6 @@ def write_summary_csv(path: Path, rows: list[dict[str, Any]]) -> None: def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None: metadata = load_metadata(path.parent.name) status_counts = Counter(row["overall_status"] for row in rows) - issue_counts: Counter[str] = Counter() - for row in rows: - for tag in str(row.get("issue_tags") or "").split(";"): - if tag: - issue_counts[tag] += 1 reviewed = len(rows) - status_counts.get("unreviewed", 0) lines = [ @@ -371,13 +329,6 @@ def write_summary_md(path: Path, rows: list[dict[str, Any]]) -> None: for status, count in sorted(status_counts.items()): lines.append(f"| {status} | {count} |") - lines.extend(["", "## Issue Counts", "", "| Issue | Count |", "| --- | ---: |"]) - if issue_counts: - for issue, count in 
issue_counts.most_common(): - lines.append(f"| {issue} | {count} |") - else: - lines.append("| none | 0 |") - atomic_write_text(path, "\n".join(lines) + "\n") diff --git a/annotation_OCR/templates/index.html b/annotation_OCR/templates/index.html index 2c21e6c..4254140 100644 --- a/annotation_OCR/templates/index.html +++ b/annotation_OCR/templates/index.html @@ -7,7 +7,7 @@ OCR Annotation @@ -28,6 +28,7 @@