diff --git a/Dockerfile b/Dockerfile index 21caee3..4d3d521 100644 --- a/Dockerfile +++ b/Dockerfile @@ -138,5 +138,15 @@ RUN python3.12 -m venv "$VIRTUAL_ENV" && "$VIRTUAL_ENV/bin/python" && "$VIRTUAL_ # Byte-compile using venv python RUN "$VIRTUAL_ENV/bin/python" -m compileall /ocr_service +# Run as non-root by default for Kubernetes restricted policies. +ARG OCR_SERVICE_UID=10001 +ARG OCR_SERVICE_GID=10001 +RUN groupadd --system --gid "$OCR_SERVICE_GID" ocrsvc && \ + useradd --system --uid "$OCR_SERVICE_UID" --gid "$OCR_SERVICE_GID" --create-home --home-dir /home/ocrsvc --shell /usr/sbin/nologin ocrsvc && \ + mkdir -p /ocr_service/tmp /ocr_service/log && \ + chown -R ocrsvc:ocrsvc /ocr_service/tmp /ocr_service/log /home/ocrsvc +ENV HOME=/home/ocrsvc +USER ocrsvc + # Now run the simple api CMD ["/bin/bash", "start_service_production.sh"] diff --git a/README.md b/README.md index 6eb1b7e..4292c87 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ Check `http://localhost:8090/docs` for input information. The service exposes: - *GET* `/api/health` - returns `{"status": "healthy"}`, +- *GET* `/api/ready` - returns readiness for OCR processing (`200` when ready, `503` when not ready), - *GET* `/api/info` - returns information about the service with its configuration, - *POST* `/api/process` - processes a binary data stream with the binary document content ("Content-Type: application/octet-stream"), also accepts binary files directly via the 'file' parameter, if sending via curl. It - *POST* `/api/process_file` - processes a file via multipart/form-data, @@ -260,7 +261,8 @@ Gunicorn/runtime env vars used by start scripts and docker: ```text OCR_SERVICE_HOST - bind host (default "0.0.0.0" in env templates) OCR_SERVICE_WORKER_CLASS - "sync" or "gthread" (env default is "sync") -OCR_SERVICE_GUNICORN_LOG_FILE_PATH, OCR_SERVICE_GUNICORN_LOG_LEVEL +OCR_SERVICE_GUNICORN_LOG_FILE_PATH - Gunicorn access log target (use "-" for stdout; default in env templates) +OCR_SERVICE_GUNICORN_LOG_LEVEL OCR_SERVICE_GUNICORN_MAX_REQUESTS, OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER OCR_SERVICE_GUNICORN_TIMEOUT, OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT ``` diff --git a/env/ocr_service.env b/env/ocr_service.env index 87d6e4f..4758ecd 100644 --- a/env/ocr_service.env +++ b/env/ocr_service.env @@ -42,7 +42,8 @@ OCR_WEB_SERVICE_THREADS=1 OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER=5000 OCR_SERVICE_GUNICORN_MAX_REQUESTS=50000 -OCR_SERVICE_GUNICORN_LOG_FILE_PATH="./log/ocr_service.log" +# Gunicorn access logs: "-" means stdout (recommended for Docker/Kubernetes) +OCR_SERVICE_GUNICORN_LOG_FILE_PATH="-" OCR_SERVICE_PORT=8090 diff --git a/ocr_service/api/health.py b/ocr_service/api/health.py index 8138647..13021a1 100644 --- a/ocr_service/api/health.py +++ b/ocr_service/api/health.py @@ -1,4 +1,7 @@ -from fastapi import APIRouter +from typing import Any + +import psutil +from fastapi import APIRouter, Request, status from fastapi.responses import ORJSONResponse from ocr_service.dto.info_response import InfoResponse @@ -12,6 +15,69 @@ def health() -> ORJSONResponse: return ORJSONResponse(content={"status": "healthy"}) +def _collect_readiness_issues(request: Request) -> list[str]: + issues: list[str] = [] + + processor = getattr(request.app.state, "processor", None) + if processor is None: + return ["processor_not_initialized"] + + loffice_process_list = getattr(processor, "loffice_process_list", None) + if not isinstance(loffice_process_list, dict) or len(loffice_process_list) == 0: + return ["libreoffice_process_list_empty"] + + for port, proc_data in loffice_process_list.items(): + if not isinstance(proc_data, dict): + issues.append(f"libreoffice_process_invalid_metadata:{port}") + continue + + if proc_data.get("unhealthy"): + issues.append(f"libreoffice_process_marked_unhealthy:{port}") + continue + + process_obj = proc_data.get("process") + process_pid: Any = getattr(process_obj, "pid", None) or proc_data.get("pid") + + try: + process_pid = int(process_pid) + except (TypeError, ValueError): + issues.append(f"libreoffice_process_missing_pid:{port}") + continue + + if process_obj is not None and hasattr(process_obj, "poll") and process_obj.poll() is not None: + issues.append(f"libreoffice_process_exited:{port}") + continue + + if not psutil.pid_exists(process_pid): + issues.append(f"libreoffice_process_pid_not_found:{port}") + continue + + try: + lo_process = psutil.Process(process_pid) + if not lo_process.is_running(): + issues.append(f"libreoffice_process_not_running:{port}") + continue + if lo_process.status() == psutil.STATUS_ZOMBIE: + issues.append(f"libreoffice_process_zombie:{port}") + except psutil.Error: + issues.append(f"libreoffice_process_not_accessible:{port}") + + return issues + + +@health_api.get("/ready", response_class=ORJSONResponse) +def ready(request: Request) -> ORJSONResponse: + issues = _collect_readiness_issues(request) + if len(issues) > 0: + return ORJSONResponse( + content={"status": "not_ready", "issues": issues}, + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + ) + + process_count = len(request.app.state.processor.loffice_process_list) + return ORJSONResponse(content={"status": "ready", "libreoffice_processes": process_count}) + + @health_api.get("/info", response_model=InfoResponse, response_class=ORJSONResponse) def info() -> ORJSONResponse: return ORJSONResponse(content=get_app_info()) diff --git a/ocr_service/tests/test_health.py b/ocr_service/tests/test_health.py new file mode 100644 index 0000000..49adad0 --- /dev/null +++ b/ocr_service/tests/test_health.py @@ -0,0 +1,91 @@ +import unittest +from unittest.mock import patch + +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from ocr_service.api.health import health_api + + +class DummySubprocess: + def __init__(self, pid: int, returncode: int | None = None) -> None: + self.pid = pid + self._returncode = returncode + + def poll(self) -> int | None: + return self._returncode + + +class DummyPsutilProcess: + def __init__(self, running: bool, process_status: str) -> None: + self._running = running + self._status = process_status + + def is_running(self) -> bool: + return self._running + + def status(self) -> str: + return self._status + + +class DummyProcessor: + def __init__(self, loffice_process_list): + self.loffice_process_list = loffice_process_list + + +class TestHealthApi(unittest.TestCase): + def setUp(self) -> None: + self.app = FastAPI() + self.app.include_router(health_api) + self.client = TestClient(self.app) + + def tearDown(self) -> None: + self.client.close() + + def test_health_returns_healthy(self): + response = self.client.get("/api/health") + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json(), {"status": "healthy"}) + + def test_ready_returns_503_when_processor_not_initialized(self): + response = self.client.get("/api/ready") + self.assertEqual(response.status_code, 503) + data = response.json() + self.assertEqual(data.get("status"), "not_ready") + self.assertIn("processor_not_initialized", data.get("issues", [])) + + def test_ready_returns_503_when_libreoffice_process_exited(self): + self.app.state.processor = DummyProcessor( + { + "9900": { + "process": DummySubprocess(pid=12345, returncode=1), + "pid": 12345, + "unhealthy": False, + } + } + ) + + response = self.client.get("/api/ready") + self.assertEqual(response.status_code, 503) + data = response.json() + self.assertEqual(data.get("status"), "not_ready") + self.assertIn("libreoffice_process_exited:9900", data.get("issues", [])) + + @patch("ocr_service.api.health.psutil.Process") + @patch("ocr_service.api.health.psutil.pid_exists", return_value=True) + def test_ready_returns_200_for_running_libreoffice_process(self, _pid_exists, process_mock): + process_mock.return_value = DummyPsutilProcess(running=True, process_status="sleeping") + self.app.state.processor = DummyProcessor( + { + "9900": { + "process": DummySubprocess(pid=12345, returncode=None), + "pid": 12345, + "unhealthy": False, + } + } + ) + + response = self.client.get("/api/ready") + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json(), {"status": "ready", "libreoffice_processes": 1}) + diff --git a/start_service_debug.sh b/start_service_debug.sh index 1c3fe95..6c6a674 100644 --- a/start_service_debug.sh +++ b/start_service_debug.sh @@ -16,7 +16,7 @@ export OCR_SERVICE_PORT="${OCR_SERVICE_PORT:-8090}" export OCR_SERVICE_WORKER_CLASS="${OCR_SERVICE_WORKER_CLASS:-sync}" export OCR_WEB_SERVICE_WORKERS="${OCR_WEB_SERVICE_WORKERS:-1}" export OCR_SERVICE_LOG_LEVEL="${OCR_SERVICE_LOG_LEVEL:-20}" -export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:-./log/ocr_service.log}" +export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:--}" export OCR_SERVICE_GUNICORN_LOG_LEVEL="${OCR_SERVICE_GUNICORN_LOG_LEVEL:-info}" export OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER="${OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER:-5000}" export OCR_SERVICE_GUNICORN_MAX_REQUESTS="${OCR_SERVICE_GUNICORN_MAX_REQUESTS:-50000}" @@ -52,13 +52,15 @@ else exit 1 fi -$python_version -m gunicorn wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \ - --bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \ - --threads "1" \ - --workers "$OCR_WEB_SERVICE_WORKERS" \ - --access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \ - --reload --log-level "debug" \ - --max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \ - --max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \ - --timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \ - --graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT" +gunicorn_cmd=("$python_version" "-m" "gunicorn") + +exec "${gunicorn_cmd[@]}" wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \ + --bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \ + --threads "1" \ + --workers "$OCR_WEB_SERVICE_WORKERS" \ + --access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \ + --reload --log-level "debug" \ + --max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \ + --max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \ + --timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \ + --graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT" diff --git a/start_service_production.sh b/start_service_production.sh index 04a6d6a..ca1fdf1 100644 --- a/start_service_production.sh +++ b/start_service_production.sh @@ -7,7 +7,7 @@ export OCR_SERVICE_PORT="${OCR_SERVICE_PORT:-8090}" export OCR_SERVICE_WORKER_CLASS="${OCR_SERVICE_WORKER_CLASS:-sync}" export OCR_WEB_SERVICE_WORKERS="${OCR_WEB_SERVICE_WORKERS:-1}" export OCR_SERVICE_LOG_LEVEL="${OCR_SERVICE_LOG_LEVEL:-20}" -export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:-./log/ocr_service.log}" +export OCR_SERVICE_GUNICORN_LOG_FILE_PATH="${OCR_SERVICE_GUNICORN_LOG_FILE_PATH:--}" export OCR_SERVICE_GUNICORN_LOG_LEVEL="${OCR_SERVICE_GUNICORN_LOG_LEVEL:-info}" export OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER="${OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER:-5000}" export OCR_SERVICE_GUNICORN_MAX_REQUESTS="${OCR_SERVICE_GUNICORN_MAX_REQUESTS:-50000}" @@ -36,7 +36,7 @@ if [[ -x "$VIRTUAL_ENV/bin/gunicorn" ]]; then export VIRTUAL_ENV export PATH="$VIRTUAL_ENV/bin:$PATH" python_cmd="$VIRTUAL_ENV/bin/python" - gunicorn_cmd="$VIRTUAL_ENV/bin/gunicorn" + gunicorn_cmd=("$VIRTUAL_ENV/bin/gunicorn") else # Fallback to system python if venv missing python_cmd=python3 @@ -47,16 +47,16 @@ else elif command -v python3.13 &>/dev/null; then python_cmd=python3.13 fi - gunicorn_cmd="$python_cmd -m gunicorn" + gunicorn_cmd=("$python_cmd" "-m" "gunicorn") fi -$gunicorn_cmd wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \ - --bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \ - --threads "1" \ - --workers "$OCR_WEB_SERVICE_WORKERS" \ - --access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \ - --log-level "$OCR_SERVICE_GUNICORN_LOG_LEVEL" \ - --max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \ - --max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \ - --timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \ - --graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT" +exec "${gunicorn_cmd[@]}" wsgi:app --worker-class "$OCR_SERVICE_WORKER_CLASS" \ + --bind "$OCR_SERVICE_HOST:$OCR_SERVICE_PORT" \ + --threads "1" \ + --workers "$OCR_WEB_SERVICE_WORKERS" \ + --access-logfile "$OCR_SERVICE_GUNICORN_LOG_FILE_PATH" \ + --log-level "$OCR_SERVICE_GUNICORN_LOG_LEVEL" \ + --max-requests "$OCR_SERVICE_GUNICORN_MAX_REQUESTS" \ + --max-requests-jitter "$OCR_SERVICE_GUNICORN_MAX_REQUESTS_JITTER" \ + --timeout "$OCR_SERVICE_GUNICORN_TIMEOUT" \ + --graceful-timeout "$OCR_SERVICE_GUNICORN_GRACEFUL_TIMEOUT"