Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -138,5 +138,15 @@ RUN python3.12 -m venv "$VIRTUAL_ENV" && "$VIRTUAL_ENV/bin/python" && "$VIRTUAL_
# Byte-compile using venv python
RUN "$VIRTUAL_ENV/bin/python" -m compileall /ocr_service

# Run as non-root by default for Kubernetes restricted policies.
# UID/GID for the service account. Declared as build args so an image build
# can override them (docker build --build-arg OCR_SERVICE_UID=...).
ARG OCR_SERVICE_UID=10001
ARG OCR_SERVICE_GID=10001
# Create the "ocrsvc" system group and user (home /home/ocrsvc, nologin shell),
# make sure the service's tmp and log directories exist, and hand ownership of
# those directories and the home directory to the new user.
RUN groupadd --system --gid "$OCR_SERVICE_GID" ocrsvc && \
useradd --system --uid "$OCR_SERVICE_UID" --gid "$OCR_SERVICE_GID" --create-home --home-dir /home/ocrsvc --shell /usr/sbin/nologin ocrsvc && \
mkdir -p /ocr_service/tmp /ocr_service/log && \
chown -R ocrsvc:ocrsvc /ocr_service/tmp /ocr_service/log /home/ocrsvc
# Point HOME at the service user's home directory.
ENV HOME=/home/ocrsvc
# Subsequent instructions and the container's CMD run as ocrsvc.
USER ocrsvc

# Now run the simple api
CMD ["/bin/bash", "start_service_production.sh"]
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ Check `http://localhost:8090/docs` for input information.
The service exposes:

- *GET* `/api/health` - returns `{"status": "healthy"}`,
- *GET* `/api/ready` - returns readiness for OCR processing (`200` when ready, `503` when not ready),
- *GET* `/api/info` - returns information about the service with its configuration,
- *POST* `/api/process` - processes a binary data stream with the binary document content ("Content-Type: application/octet-stream"), also accepts binary files directly via the 'file' parameter, if sending via curl. It
- *POST* `/api/process_file` - processes a file via multipart/form-data,
Expand Down Expand Up @@ -260,7 +261,8 @@ Gunicorn/runtime env vars used by start scripts and docker:
```text
OCR_SERVICE_HOST - bind host (default "0.0.0.0" in env templates)
OCR_SERVICE_WORKER_CLASS - "sync" or "gthread" (env default is "sync")
OCR_SERVICE_GUNICORN_LOG_FILE_PATH, OCR_SERVICE_GUNICORN_LOG_LEVEL
OCR_SERVICE_GUNICORN_LOG_FILE_PATH - Gunicorn access log target; "-" writes to stdout (the default in the env templates and start scripts)
OCR_SERVICE_GUNICORN_LOG_LEVEL - Gunicorn log level (default "info"; the debug start script always passes "debug")
OCR_SERVICE_GUNICORN_MAX_REQUESTS, OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER
OCR_SERVICE_GUNICORN_TIMEOUT, OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT
```
3 changes: 2 additions & 1 deletion env/ocr_service.env
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ OCR_WEB_SERVICE_THREADS=1
OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER=5000
OCR_SERVICE_GUNICORN_MAX_REQUESTS=50000

OCR_SERVICE_GUNICORN_LOG_FILE_PATH="./log/ocr_service.log"
# Gunicorn access logs: "-" means stdout (recommended for Docker/Kubernetes)
OCR_SERVICE_GUNICORN_LOG_FILE_PATH="-"

OCR_SERVICE_PORT=8090

Expand Down
68 changes: 67 additions & 1 deletion ocr_service/api/health.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from fastapi import APIRouter
from typing import Any

import psutil
from fastapi import APIRouter, Request, status
from fastapi.responses import ORJSONResponse

from ocr_service.dto.info_response import InfoResponse
Expand All @@ -12,6 +15,69 @@ def health() -> ORJSONResponse:
return ORJSONResponse(content={"status": "healthy"})


def _collect_readiness_issues(request: Request) -> list[str]:
issues: list[str] = []

processor = getattr(request.app.state, "processor", None)
if processor is None:
return ["processor_not_initialized"]

loffice_process_list = getattr(processor, "loffice_process_list", None)
if not isinstance(loffice_process_list, dict) or len(loffice_process_list) == 0:
return ["libreoffice_process_list_empty"]

for port, proc_data in loffice_process_list.items():
if not isinstance(proc_data, dict):
issues.append(f"libreoffice_process_invalid_metadata:{port}")
continue

if proc_data.get("unhealthy"):
issues.append(f"libreoffice_process_marked_unhealthy:{port}")
continue

process_obj = proc_data.get("process")
process_pid: Any = getattr(process_obj, "pid", None) or proc_data.get("pid")

try:
process_pid = int(process_pid)
except (TypeError, ValueError):
issues.append(f"libreoffice_process_missing_pid:{port}")
continue

if process_obj is not None and hasattr(process_obj, "poll") and process_obj.poll() is not None:
issues.append(f"libreoffice_process_exited:{port}")
continue

if not psutil.pid_exists(process_pid):
issues.append(f"libreoffice_process_pid_not_found:{port}")
continue

try:
lo_process = psutil.Process(process_pid)
if not lo_process.is_running():
issues.append(f"libreoffice_process_not_running:{port}")
continue
if lo_process.status() == psutil.STATUS_ZOMBIE:
issues.append(f"libreoffice_process_zombie:{port}")
except psutil.Error:
issues.append(f"libreoffice_process_not_accessible:{port}")

return issues


@health_api.get("/ready", response_class=ORJSONResponse)
def ready(request: Request) -> ORJSONResponse:
    """Report whether the service is ready to process documents.

    Responds with ``{"status": "ready", "libreoffice_processes": <count>}``
    when no readiness issues are found, otherwise with HTTP 503 and
    ``{"status": "not_ready", "issues": [...]}``.
    """
    problems = _collect_readiness_issues(request)

    if not problems:
        tracked = request.app.state.processor.loffice_process_list
        return ORJSONResponse(
            content={"status": "ready", "libreoffice_processes": len(tracked)}
        )

    return ORJSONResponse(
        content={"status": "not_ready", "issues": problems},
        status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
    )


@health_api.get("/info", response_model=InfoResponse, response_class=ORJSONResponse)
def info() -> ORJSONResponse:
    """Return the payload produced by ``get_app_info()`` as a JSON response."""
    app_info = get_app_info()
    return ORJSONResponse(content=app_info)
91 changes: 91 additions & 0 deletions ocr_service/tests/test_health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import unittest
from unittest.mock import patch

from fastapi import FastAPI
from fastapi.testclient import TestClient

from ocr_service.api.health import health_api


class DummySubprocess:
def __init__(self, pid: int, returncode: int | None = None) -> None:
self.pid = pid
self._returncode = returncode

def poll(self) -> int | None:
return self._returncode


class DummyPsutilProcess:
    """Test double for the ``is_running()`` / ``status()`` part of a psutil process."""

    def __init__(self, running: bool, process_status: str) -> None:
        self._status = process_status
        self._running = running

    def is_running(self) -> bool:
        """Return the ``running`` flag given at construction."""
        return self._running

    def status(self) -> str:
        """Return the ``process_status`` string given at construction."""
        return self._status


class DummyProcessor:
    """Test double for the processor object; it only carries ``loffice_process_list``."""

    def __init__(self, loffice_process_list):
        # Kept by reference, exactly as passed in.
        self.loffice_process_list = loffice_process_list


class TestHealthApi(unittest.TestCase):
    """Endpoint tests for the health router mounted on a throwaway FastAPI app."""

    def setUp(self) -> None:
        # A fresh app per test keeps app.state from leaking between tests.
        app = FastAPI()
        app.include_router(health_api)
        self.app = app
        self.client = TestClient(app)

    def tearDown(self) -> None:
        self.client.close()

    def _install_single_process(self, returncode) -> None:
        """Attach a processor tracking one LibreOffice process on port 9900."""
        self.app.state.processor = DummyProcessor(
            {
                "9900": {
                    "process": DummySubprocess(pid=12345, returncode=returncode),
                    "pid": 12345,
                    "unhealthy": False,
                }
            }
        )

    def test_health_returns_healthy(self):
        resp = self.client.get("/api/health")

        self.assertEqual(resp.status_code, 200)
        self.assertEqual(resp.json(), {"status": "healthy"})

    def test_ready_returns_503_when_processor_not_initialized(self):
        # No processor is attached to app.state in this test.
        resp = self.client.get("/api/ready")

        self.assertEqual(resp.status_code, 503)
        payload = resp.json()
        self.assertEqual(payload.get("status"), "not_ready")
        self.assertIn("processor_not_initialized", payload.get("issues", []))

    def test_ready_returns_503_when_libreoffice_process_exited(self):
        # A non-None poll() result marks the process as exited.
        self._install_single_process(returncode=1)

        resp = self.client.get("/api/ready")

        self.assertEqual(resp.status_code, 503)
        payload = resp.json()
        self.assertEqual(payload.get("status"), "not_ready")
        self.assertIn("libreoffice_process_exited:9900", payload.get("issues", []))

    @patch("ocr_service.api.health.psutil.Process")
    @patch("ocr_service.api.health.psutil.pid_exists", return_value=True)
    def test_ready_returns_200_for_running_libreoffice_process(self, _pid_exists, process_mock):
        # psutil is patched so the fake PID looks like a live, sleeping process.
        process_mock.return_value = DummyPsutilProcess(running=True, process_status="sleeping")
        self._install_single_process(returncode=None)

        resp = self.client.get("/api/ready")

        self.assertEqual(resp.status_code, 200)
        self.assertEqual(resp.json(), {"status": "ready", "libreoffice_processes": 1})

24 changes: 13 additions & 11 deletions start_service_debug.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export OCR_SERVICE_PORT="${OCR_SERVICE_PORT:-8090}"
export OCR_SERVICE_WORKER_CLASS="${OCR_SERVICE_WORKER_CLASS:-sync}"
export OCR_WEB_SERVICE_WORKERS="${OCR_WEB_SERVICE_WORKERS:-1}"
export OCR_SERVICE_LOG_LEVEL="${OCR_SERVICE_LOG_LEVEL:-20}"
export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:-./log/ocr_service.log}"
export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:--}"
export OCR_SERVICE_GUNICORN_LOG_LEVEL="${OCR_SERVICE_GUNICORN_LOG_LEVEL:-info}"
export OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER="${OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER:-5000}"
export OCR_SERVICE_GUNICORN_MAX_REQUESTS="${OCR_SERVICE_GUNICORN_MAX_REQUESTS:-50000}"
Expand Down Expand Up @@ -52,13 +52,15 @@ else
exit 1
fi

$python_version -m gunicorn wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \
--bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \
--threads "1" \
--workers "$OCR_WEB_SERVICE_WORKERS" \
--access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \
--reload --log-level "debug" \
--max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \
--max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \
--timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \
--graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT"
gunicorn_cmd=("$python_version" "-m" "gunicorn")

exec "${gunicorn_cmd[@]}" wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \
--bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \
--threads "1" \
--workers "$OCR_WEB_SERVICE_WORKERS" \
--access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \
--reload --log-level "debug" \
--max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \
--max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \
--timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \
--graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT"
26 changes: 13 additions & 13 deletions start_service_production.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ export OCR_SERVICE_PORT="${OCR_SERVICE_PORT:-8090}"
export OCR_SERVICE_WORKER_CLASS="${OCR_SERVICE_WORKER_CLASS:-sync}"
export OCR_WEB_SERVICE_WORKERS="${OCR_WEB_SERVICE_WORKERS:-1}"
export OCR_SERVICE_LOG_LEVEL="${OCR_SERVICE_LOG_LEVEL:-20}"
export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:-./log/ocr_service.log}"
export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:--}"
export OCR_SERVICE_GUNICORN_LOG_LEVEL="${OCR_SERVICE_GUNICORN_LOG_LEVEL:-info}"
export OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER="${OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER:-5000}"
export OCR_SERVICE_GUNICORN_MAX_REQUESTS="${OCR_SERVICE_GUNICORN_MAX_REQUESTS:-50000}"
Expand Down Expand Up @@ -36,7 +36,7 @@ if [[ -x "$VIRTUAL_ENV/bin/gunicorn" ]]; then
export VIRTUAL_ENV
export PATH="$VIRTUAL_ENV/bin:$PATH"
python_cmd="$VIRTUAL_ENV/bin/python"
gunicorn_cmd="$VIRTUAL_ENV/bin/gunicorn"
gunicorn_cmd=("$VIRTUAL_ENV/bin/gunicorn")
else
# Fallback to system python if venv missing
python_cmd=python3
Expand All @@ -47,16 +47,16 @@ else
elif command -v python3.13 &>/dev/null; then
python_cmd=python3.13
fi
gunicorn_cmd="$python_cmd -m gunicorn"
gunicorn_cmd=("$python_cmd" "-m" "gunicorn")
fi

$gunicorn_cmd wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \
--bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \
--threads "1" \
--workers "$OCR_WEB_SERVICE_WORKERS" \
--access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \
--log-level "$OCR_SERVICE_GUNICORN_LOG_LEVEL" \
--max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \
--max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \
--timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \
--graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT"
exec "${gunicorn_cmd[@]}" wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \
--bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \
--threads "1" \
--workers "$OCR_WEB_SERVICE_WORKERS" \
--access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \
--log-level "$OCR_SERVICE_GUNICORN_LOG_LEVEL" \
--max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \
--max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \
--timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \
--graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT"