diff --git a/.codacy.yml b/.codacy.yml
new file mode 100644
index 0000000000..626bab79e8
--- /dev/null
+++ b/.codacy.yml
@@ -0,0 +1,15 @@
+---
+# Codacy configuration file
+# https://docs.codacy.com/repositories-configure/codacy-configuration-file/
+# Analyze only doctr/ and references/ — everything else is excluded.
+
+exclude_paths:
+  - ".github/**"
+  - "api/**"
+  - "demo/**"
+  - "docs/**"
+  - "notebooks/**"
+  - "scripts/**"
+  - "tests/**"
+  # Root-level files (setup.py, pyproject.toml, README.md, ...)
+  - "*.*"
diff --git a/doctr/utils/fonts.py b/doctr/utils/fonts.py
index c67ac6b363..ac93a707e8 100644
--- a/doctr/utils/fonts.py
+++ b/doctr/utils/fonts.py
@@ -5,34 +5,81 @@
 
 import logging
 import platform
+from functools import lru_cache
 
 from PIL import ImageFont
 
 __all__ = ["get_font"]
 
+_FONT_CANDIDATES: dict[str, tuple[str, ...]] = {
+    "Linux": (
+        "DejaVuSans.ttf",
+        "NotoSans-Regular.ttf",
+        "LiberationSans-Regular.ttf",
+        "FreeSans.ttf",
+        "FreeMono.ttf",  # legacy default
+    ),
+    "Darwin": (
+        "Arial Unicode.ttf",
+        "Helvetica.ttc",
+        "Arial.ttf",  # legacy default
+    ),
+    "Windows": (
+        "arial.ttf",  # legacy default
+        "segoeui.ttf",
+        "tahoma.ttf",
+    ),
+}
+
+
+@lru_cache(maxsize=1)
+def _resolve_default_font_family() -> str | None:
+    """Find the first available candidate font for this platform.
+
+    Returns:
+        the first candidate name Pillow manages to load, or None if none is available
+    """
+    # Unknown platforms fall back to the Linux candidate list
+    candidates = _FONT_CANDIDATES.get(platform.system(), _FONT_CANDIDATES["Linux"])
+    for family in candidates:
+        try:
+            # Probe by loading: Pillow raises OSError when the font cannot be found or read
+            ImageFont.truetype(family, 10)
+            return family
+        except OSError:
+            continue
+    return None
+
 
 def get_font(font_family: str | None = None, font_size: int = 13) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
     """Resolves a compatible ImageFont for the system
 
     Args:
-        font_family: the font family to use
+        font_family: the font family (or path to a font file) to use. If None,
+            the best available system font is picked automatically.
         font_size: the size of the font upon rendering
 
     Returns:
         the Pillow font
+
+    Raises:
+        OSError: if an explicitly requested ``font_family`` cannot be loaded
     """
-    # Font selection
-    if font_family is None:
-        try:
-            font = ImageFont.truetype("FreeMono.ttf" if platform.system() == "Linux" else "Arial.ttf", font_size)
-        except OSError:  # pragma: no cover
-            font = ImageFont.load_default()  # type: ignore[assignment]
-            logging.warning(
-                "unable to load recommended font family. Loading default PIL font,"
-                "font size issues may be expected."
-                "To prevent this, it is recommended to specify the value of 'font_family'."
-            )
-    else:  # pragma: no cover
-        font = ImageFont.truetype(font_family, font_size)
-
-    return font
+    if font_family is not None:
+        return ImageFont.truetype(font_family, font_size)
+
+    default_family = _resolve_default_font_family()
+    if default_family is not None:
+        return ImageFont.truetype(default_family, font_size)
+
+    # Last resort: Pillow's built-in font. Warn before trying: `load_default(size=...)` only raises
+    # TypeError on Pillow releases without the `size` argument, so warning there would leave the fallback silent.
+    logging.warning(
+        "Unable to load any recommended font family. Loading default PIL font, "
+        "font size issues may be expected. "
+        "To prevent this, it is recommended to specify the value of 'font_family'."
+    )
+    try:
+        return ImageFont.load_default(size=font_size)
+    except TypeError:  # pragma: no cover
+        return ImageFont.load_default()
diff --git a/doctr/utils/reconstitution.py b/doctr/utils/reconstitution.py
index 6fad671867..b5afcc6210 100644
--- a/doctr/utils/reconstitution.py
+++ b/doctr/utils/reconstitution.py
@@ -3,26 +3,126 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to for full license details.
import logging +import math +from functools import lru_cache from typing import Any import numpy as np from anyascii import anyascii -from PIL import Image, ImageDraw +from PIL import Image, ImageDraw, ImageFont from .fonts import get_font __all__ = ["synthesize_page", "synthesize_kie_page"] -# Global variable to avoid multiple warnings -ROTATION_WARNING = False +@lru_cache(maxsize=256) +def _cached_font(font_family: str | None, font_size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont: + """Memoized font loader: avoids re-reading the font file for every word.""" + return get_font(font_family, font_size) -def _warn_rotation(entry: dict[str, Any]) -> None: # pragma: no cover - global ROTATION_WARNING - if not ROTATION_WARNING and len(entry["geometry"]) == 4: - logging.warning("Polygons with larger rotations will lead to inaccurate rendering") - ROTATION_WARNING = True +@lru_cache(maxsize=1) +def _warn_rotation_once() -> None: # pragma: no cover + # lru_cache gives us thread-safe "warn once" semantics without a mutable global + logging.warning("Polygons with larger rotations may lead to slightly inaccurate rendering") + + +def _polygon_angle(polygon: list[tuple[float, float]], w: int, h: int) -> float: + """Estimate the rotation angle (degrees, counter-clockwise) from the top edge of a 4-point polygon.""" + (x0, y0), (x1, y1) = polygon[0], polygon[1] + return -math.degrees(math.atan2((y1 - y0) * h, (x1 - x0) * w)) + + +def _text_width(font: ImageFont.FreeTypeFont | ImageFont.ImageFont, text: str) -> int: + bbox = font.getbbox(text) + return max(int(bbox[2]) - int(bbox[0]), 1) + + +def _fit_font( + text: str, + box_w: int, + box_h: int, + font_family: str | None, + min_font_size: int, + max_font_size: int, +) -> ImageFont.FreeTypeFont | ImageFont.ImageFont: + """Directly estimate the largest font size fitting the box (text width scales ~linearly with size).""" + font_size = max(min(box_h, max_font_size), min_font_size) + try: + font = _cached_font(font_family, 
font_size) + x0, y0, x1, y1 = font.getbbox(text) + text_w, text_h = max(int(x1) - int(x0), 1), max(int(y1) - int(y0), 1) + if text_w > box_w or text_h > box_h: + scale = min(box_w / text_w, box_h / text_h) + font_size = max(min(int(font_size * scale), max_font_size), min_font_size) + font = _cached_font(font_family, font_size) + # The linear estimate can be off by a pixel or two: shrink until the text truly fits + while font_size > min_font_size and _text_width(font, text) > box_w: + font_size -= 1 + font = _cached_font(font_family, font_size) + except ValueError: # pragma: no cover + font = _cached_font(font_family, min_font_size) + return font + + +def _fit_line_font( + word_widths: list[tuple[str, int]], + line_height: int, + font_family: str | None, + min_font_size: int, + max_font_size: int, +) -> ImageFont.FreeTypeFont | ImageFont.ImageFont: + """Find one font size for a whole line such that every word fits its own available width.""" + font_size = max(min(line_height, max_font_size), min_font_size) + try: + font = _cached_font(font_family, font_size) + # Scale down so the most constrained word still fits its own box (linear estimate) + scale = min([avail_w / _text_width(font, value) for value, avail_w in word_widths] + [1.0]) + if scale < 1.0: + font_size = max(min(int(font_size * scale), max_font_size), min_font_size) + font = _cached_font(font_family, font_size) + # The linear estimate can be off by a pixel or two: shrink until every word truly fits + while font_size > min_font_size and any(_text_width(font, value) > avail_w for value, avail_w in word_widths): + font_size -= 1 + font = _cached_font(font_family, font_size) + except ValueError: # pragma: no cover + font = _cached_font(font_family, min_font_size) + return font + + +def _draw_word( + d: ImageDraw.ImageDraw, + xy: tuple[int, int], + text: str, + font: ImageFont.FreeTypeFont | ImageFont.ImageFont, + fill: tuple[int, int, int], + anchor: str = "lm", +) -> None: + try: + try: + d.text(xy, text, 
font=font, fill=fill, anchor=anchor) + except UnicodeEncodeError: + d.text(xy, anyascii(text), font=font, fill=fill, anchor=anchor) + except Exception: # pragma: no cover + logging.warning(f"Could not render word: {text}") + + +def _paste_rotated_word( + response: Image.Image, + text: str, + font: ImageFont.FreeTypeFont | ImageFont.ImageFont, + center: tuple[int, int], + angle: float, + fill: tuple[int, int, int], +) -> None: + """Render a word on a transparent patch, rotate it, and paste it centered on the polygon centroid.""" + bbox = font.getbbox(text) + x0, y0, x1, y1 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) + patch = Image.new("RGBA", (max(x1 - x0, 1) + 4, max(y1 - y0, 1) + 4), (0, 0, 0, 0)) + _draw_word(ImageDraw.Draw(patch), (2 - x0, 2 - y0), text, font, fill, anchor="la") + patch = patch.rotate(angle, expand=True, resample=Image.Resampling.BICUBIC) + response.paste(patch, (center[0] - patch.width // 2, center[1] - patch.height // 2), patch) def _synthesize( @@ -32,17 +132,19 @@ def _synthesize( h: int, draw_proba: bool = False, font_family: str | None = None, - smoothing_factor: float = 0.75, min_font_size: int = 6, max_font_size: int = 50, + text_color: tuple[int, int, int] = (0, 0, 0), ) -> Image.Image: if len(entry["geometry"]) == 2: - (xmin, ymin), (xmax, ymax) = entry["geometry"] - polygon = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)] + (xmin_r, ymin_r), (xmax_r, ymax_r) = entry["geometry"] + polygon = [(xmin_r, ymin_r), (xmax_r, ymin_r), (xmax_r, ymax_r), (xmin_r, ymax_r)] + angle = 0.0 else: polygon = entry["geometry"] + angle = _polygon_angle(polygon, w, h) - # Calculate the bounding box of the word + # Calculate the bounding box of the entry x_coords, y_coords = zip(*polygon) xmin, ymin, xmax, ymax = ( int(round(w * min(x_coords))), @@ -50,61 +152,82 @@ def _synthesize( int(round(w * max(x_coords))), int(round(h * max(y_coords))), ) - word_width = xmax - xmin - word_height = ymax - ymin + box_width, box_height = 
max(xmax - xmin, 1), max(ymax - ymin, 1)
+
+    d = ImageDraw.Draw(response)
 
-    # If lines are provided instead of words, concatenate the word entries
     if "words" in entry:
-        word_text = " ".join(word["value"] for word in entry["words"])
+        # Line entry: one consistent font size for the whole line, drawn word by word.
+        if not entry["words"]:
+            # Empty line: nothing to draw, and min() below / the confidence mean would raise
+            return response
+        # Per word: (value, anchor x, anchor y, available width, available height, angle)
+        word_render: list[tuple[str, int, int, int, int, float]] = []
+        for word in entry["words"]:
+            geom = word["geometry"]
+            if len(geom) == 2:
+                (gx0, gy0), (gx1, gy1) = geom
+                wxmin, wymin = int(round(w * gx0)), int(round(h * gy0))
+                wxmax, wymax = int(round(w * gx1)), int(round(h * gy1))
+                ax, ay, word_angle = wxmin, (wymin + wymax) // 2, 0.0
+                avail_w, avail_h = wxmax - wxmin, wymax - wymin
+            else:
+                xs, ys = zip(*geom)
+                word_angle = _polygon_angle(geom, w, h)
+                if abs(word_angle) > 3:
+                    # Rotated words are pasted centered on the polygon centroid
+                    ax = int(round(w * sum(xs) / len(xs)))
+                else:
+                    # Near-horizontal words are drawn with the "lm" anchor, which expects the
+                    # LEFT edge of the box; anchoring them on the centroid shifts the text right
+                    ax = int(round(w * min(xs)))
+                ay = int(round(h * sum(ys) / len(ys)))
+                # True text-direction extent: length of the top edge / left edge in pixels
+                avail_w = int(round(math.hypot((geom[1][0] - geom[0][0]) * w, (geom[1][1] - geom[0][1]) * h)))
+                avail_h = int(round(math.hypot((geom[2][0] - geom[1][0]) * w, (geom[2][1] - geom[1][1]) * h)))
+            word_render.append((word["value"], ax, ay, max(avail_w, 1), max(avail_h, 1), word_angle))
+        # The shortest word box caps the starting font size shared by the whole line
+        line_height = min(avail_h for *_, avail_h, _angle in word_render)
+        font = _fit_line_font(
+            [(value, avail_w) for value, _, _, avail_w, _, _ in word_render],
+            line_height,
+            font_family,
+            min_font_size,
+            max_font_size,
+        )
+        # Rotated words go through a rotated patch; the others are drawn in place, left-middle anchored
+        for value, ax, ay, _, _, word_angle in word_render:
+            if abs(word_angle) > 3:
+                _paste_rotated_word(response, value, font, (ax, ay), word_angle, text_color)
+            else:
+                _draw_word(d, (ax, ay), value, font, text_color, anchor="lm")
     else:
         word_text = entry["value"]
 
-    # Find the optimal font size
-    try:
-        font_size = min(word_height, max_font_size)
-        font = get_font(font_family, font_size)
-        text_width, text_height = font.getbbox(word_text)[2:4]
-
-        while (text_width > word_width or text_height > word_height) and
font_size > min_font_size: - font_size = max(int(font_size * smoothing_factor), min_font_size) - font = get_font(font_family, font_size) - text_width, text_height = font.getbbox(word_text)[2:4] - except ValueError: - font = get_font(font_family, min_font_size) - - # Create a mask for the word - mask = Image.new("L", (w, h), 0) - ImageDraw.Draw(mask).polygon([(int(round(w * x)), int(round(h * y))) for x, y in polygon], fill=255) - - # Draw the word text - d = ImageDraw.Draw(response) - try: - try: - d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt") - except UnicodeEncodeError: - d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt") - # Catch generic exceptions to avoid crashing the whole rendering - except Exception: # pragma: no cover - logging.warning(f"Could not render word: {word_text}") + if abs(angle) > 3: # Rotated word: render on a patch and paste it rotated + font = _fit_font(word_text, box_width, box_height, font_family, min_font_size, max_font_size) + cx, cy = int(round(w * sum(x_coords) / len(x_coords))), int(round(h * sum(y_coords) / len(y_coords))) + _paste_rotated_word(response, word_text, font, (cx, cy), angle, text_color) + else: + font = _fit_font(word_text, box_width, box_height, font_family, min_font_size, max_font_size) + # "lm" anchor: vertically centered in the box, no ascender-offset drift + _draw_word(d, (xmin, (ymin + ymax) // 2), word_text, font, text_color, anchor="lm") if draw_proba: confidence = ( entry["confidence"] if "confidence" in entry - else sum(w["confidence"] for w in entry["words"]) / len(entry["words"]) + else sum(word["confidence"] for word in entry["words"]) / len(entry["words"]) ) p = int(255 * confidence) color = (255 - p, 0, p) # Red to blue gradient based on probability d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2) - prob_font = get_font(font_family, 20) + # Scale the confidence label with the box instead of a hardcoded size + prob_font = 
_cached_font(font_family, max(min(box_height // 2, 20), 10)) prob_text = f"{confidence:.2f}" prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4] - - # Position the probability slightly above the bounding box - prob_x_offset = (word_width - prob_text_width) // 2 - prob_y_offset = ymin - prob_text_height - 2 - prob_y_offset = max(0, prob_y_offset) - + prob_x_offset = (box_width - prob_text_width) // 2 + prob_y_offset = max(0, ymin - prob_text_height - 2) d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt") return response @@ -114,59 +237,44 @@ def synthesize_page( page: dict[str, Any], draw_proba: bool = False, font_family: str | None = None, - smoothing_factor: float = 0.95, min_font_size: int = 8, max_font_size: int = 50, + background_color: tuple[int, int, int] = (255, 255, 255), + text_color: tuple[int, int, int] = (0, 0, 0), ) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + """Draw the content of the element page (OCR response) on a blank page. Args: page: exported Page object to represent draw_proba: if True, draw words in colors to represent confidence. 
Blue: p=1, red: p=0 font_family: family of the font - smoothing_factor: factor to smooth the font size min_font_size: minimum font size max_font_size: maximum font size + background_color: RGB color of the page background + text_color: RGB color of the rendered text Returns: the synthesized page """ - # Draw template h, w = page["dimensions"] - response = Image.new("RGB", (w, h), color=(255, 255, 255)) + response = Image.new("RGB", (w, h), color=background_color) for block in page["blocks"]: - # If lines are provided use these to get better rendering results - if len(block["lines"]) > 1: - for line in block["lines"]: - _warn_rotation(block) # pragma: no cover - response = _synthesize( - response=response, - entry=line, - w=w, - h=h, - draw_proba=draw_proba, - font_family=font_family, - smoothing_factor=smoothing_factor, - min_font_size=min_font_size, - max_font_size=max_font_size, - ) - # Otherwise, draw each word - else: - for line in block["lines"]: - _warn_rotation(block) # pragma: no cover - for word in line["words"]: - response = _synthesize( - response=response, - entry=word, - w=w, - h=h, - draw_proba=draw_proba, - font_family=font_family, - smoothing_factor=smoothing_factor, - min_font_size=min_font_size, - max_font_size=max_font_size, - ) + for line in block["lines"]: + if len(line["geometry"]) == 4: + _warn_rotation_once() # pragma: no cover + # Line-level entry keeps a consistent font per line while preserving word positions + response = _synthesize( + response=response, + entry=line, + w=w, + h=h, + draw_proba=draw_proba, + font_family=font_family, + min_font_size=min_font_size, + max_font_size=max_font_size, + text_color=text_color, + ) return np.array(response, dtype=np.uint8) @@ -175,28 +283,32 @@ def synthesize_kie_page( page: dict[str, Any], draw_proba: bool = False, font_family: str | None = None, + min_font_size: int = 8, + max_font_size: int = 50, + background_color: tuple[int, int, int] = (255, 255, 255), + text_color: tuple[int, int, int] = 
(0, 0, 0), ) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + """Draw the content of the element page (KIE OCR response) on a blank page. Args: page: exported Page object to represent draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 font_family: family of the font - smoothing_factor: factor to smooth the font size min_font_size: minimum font size max_font_size: maximum font size + background_color: RGB color of the page background + text_color: RGB color of the rendered text Returns: the synthesized page """ - # Draw template h, w = page["dimensions"] - response = Image.new("RGB", (w, h), color=(255, 255, 255)) + response = Image.new("RGB", (w, h), color=background_color) - # Draw each word for predictions in page["predictions"].values(): for prediction in predictions: - _warn_rotation(prediction) # pragma: no cover + if len(prediction["geometry"]) == 4: + _warn_rotation_once() # pragma: no cover response = _synthesize( response=response, entry=prediction, @@ -204,5 +316,8 @@ def synthesize_kie_page( h=h, draw_proba=draw_proba, font_family=font_family, + min_font_size=min_font_size, + max_font_size=max_font_size, + text_color=text_color, ) return np.array(response, dtype=np.uint8) diff --git a/pyproject.toml b/pyproject.toml index e67d4eb84b..159086f8fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ dependencies = [ "langdetect>=1.0.9,<2.0.0", "rapidfuzz>=3.0.0,<4.0.0", "huggingface-hub>=0.20.0,<2.0.0", - "Pillow>=9.2.0", + "Pillow>=10.1.0", "anyascii>=0.3.2", "validators>=0.18.0", "tqdm>=4.30.0", diff --git a/tests/common/test_utils_fonts.py b/tests/common/test_utils_fonts.py index cd5d0576e4..056de475e5 100644 --- a/tests/common/test_utils_fonts.py +++ b/tests/common/test_utils_fonts.py @@ -1,10 +1,67 @@ +import pytest from PIL.ImageFont import FreeTypeFont, ImageFont +from doctr.utils import fonts from doctr.utils.fonts import get_font -def test_get_font(): 
+@pytest.fixture(autouse=True) +def _reset_font_cache(): + # Ensure each test starts with a fresh font resolution cache + fonts._resolve_default_font_family.cache_clear() + yield + fonts._resolve_default_font_family.cache_clear() + + +def test_get_font_default(): # Attempts to load recommended OS font font = get_font() assert isinstance(font, (ImageFont, FreeTypeFont)) + # The font must be able to measure text + x0, y0, x1, y1 = font.getbbox("hello") + assert x1 > x0 and y1 > y0 + + +def test_get_font_respects_size(): + font = get_font(font_size=32) + # Both system fonts and Pillow >= 10.1 scalable default expose `size` + if hasattr(font, "size"): + assert font.size == 32 + + +def test_get_font_explicit_family(): + # An explicitly requested font that exists should load + default_family = fonts._resolve_default_font_family() + if default_family is not None: + font = get_font(default_family, 16) + assert isinstance(font, FreeTypeFont) + assert font.size == 16 + + # An explicitly requested font that does not exist should fail loudly + with pytest.raises(OSError): + get_font("this-font-does-not-exist.ttf") + + +def test_get_font_resolution_is_cached(): + get_font() + info_after_first = fonts._resolve_default_font_family.cache_info() + get_font() + get_font(font_size=24) + info_after_more = fonts._resolve_default_font_family.cache_info() + + # The filesystem probing must run at most once per process + assert info_after_first.misses == 1 + assert info_after_more.misses == 1 + assert info_after_more.hits >= info_after_first.hits + 2 + + +def test_get_font_fallback(monkeypatch): + # Force every candidate to be unavailable so the built-in fallback is exercised + monkeypatch.setattr(fonts, "_FONT_CANDIDATES", dict.fromkeys(fonts._FONT_CANDIDATES, ("missing-font.ttf",))) + fonts._resolve_default_font_family.cache_clear() + + font = get_font(font_size=20) + assert isinstance(font, (ImageFont, FreeTypeFont)) + # The fallback font must still be usable for text measurement + 
assert font.getbbox("hello")[2] > 0 diff --git a/tests/common/test_utils_reconstitution.py b/tests/common/test_utils_reconstitution.py index be98db89b2..3632870998 100644 --- a/tests/common/test_utils_reconstitution.py +++ b/tests/common/test_utils_reconstitution.py @@ -4,41 +4,173 @@ from doctr.utils import reconstitution +def _assert_valid_render(render: np.ndarray, dimensions: tuple[int, int]) -> None: + assert isinstance(render, np.ndarray) + assert render.dtype == np.uint8 + assert render.shape == (*dimensions, 3) + # Something must actually have been drawn on the page + assert (render < 255).any() + + def test_synthesize_page(): pages = _mock_pages() # Test without probability rendering render_no_proba = reconstitution.synthesize_page(pages[0].export(), draw_proba=False) - assert isinstance(render_no_proba, np.ndarray) - assert render_no_proba.shape == (*pages[0].dimensions, 3) + _assert_valid_render(render_no_proba, pages[0].dimensions) + # Text is drawn in black on white: the render must stay grayscale + assert (render_no_proba[..., 0] == render_no_proba[..., 2]).all() # Test with probability rendering render_with_proba = reconstitution.synthesize_page(pages[0].export(), draw_proba=True) - assert isinstance(render_with_proba, np.ndarray) - assert render_with_proba.shape == (*pages[0].dimensions, 3) + _assert_valid_render(render_with_proba, pages[0].dimensions) + # Confidence boxes are colored (red-to-blue gradient), so R and B must differ somewhere + assert (render_with_proba[..., 0] != render_with_proba[..., 2]).any() # Test with only one line pages_one_line = pages[0].export() pages_one_line["blocks"][0]["lines"] = [pages_one_line["blocks"][0]["lines"][0]] render_one_line = reconstitution.synthesize_page(pages_one_line, draw_proba=True) - assert isinstance(render_one_line, np.ndarray) - assert render_one_line.shape == (*pages[0].dimensions, 3) + _assert_valid_render(render_one_line, pages[0].dimensions) # Test with polygons pages_poly = pages[0].export() 
pages_poly["blocks"][0]["lines"][0]["geometry"] = [(0, 0), (0, 1), (1, 1), (1, 0)] render_poly = reconstitution.synthesize_page(pages_poly, draw_proba=True) - assert isinstance(render_poly, np.ndarray) - assert render_poly.shape == (*pages[0].dimensions, 3) + _assert_valid_render(render_poly, pages[0].dimensions) + + +def test_synthesize_page_colors(): + page = _mock_pages()[0].export() + + # Custom text color + render = reconstitution.synthesize_page(page, text_color=(255, 0, 0)) + assert ((render[..., 0] > 200) & (render[..., 1] < 100) & (render[..., 2] < 100)).any() + + # Custom background color + render = reconstitution.synthesize_page(page, background_color=(0, 0, 0), text_color=(255, 255, 255)) + # Corners are part of the background + assert (render[0, 0] == 0).all() + assert (render > 128).any() + + +def test_synthesize_page_font_size_bounds(): + page = _mock_pages()[0].export() + render = reconstitution.synthesize_page(page, min_font_size=10, max_font_size=12) + _assert_valid_render(render, (300, 200)) + + +def test_synthesize_page_unicode(): + # Non-Latin text must render without raising (wide-coverage default font) + page = _mock_pages()[0].export() + page["blocks"][0]["lines"][0]["words"][0]["value"] = "Привет" + page["blocks"][0]["lines"][0]["words"][1]["value"] = "Ελληνικά" + render = reconstitution.synthesize_page(page) + _assert_valid_render(render, (300, 200)) def test_synthesize_kie_page(): pages = _mock_kie_pages() # Test without probability rendering render_no_proba = reconstitution.synthesize_kie_page(pages[0].export(), draw_proba=False) - assert isinstance(render_no_proba, np.ndarray) - assert render_no_proba.shape == (*pages[0].dimensions, 3) + _assert_valid_render(render_no_proba, pages[0].dimensions) # Test with probability rendering render_with_proba = reconstitution.synthesize_kie_page(pages[0].export(), draw_proba=True) - assert isinstance(render_with_proba, np.ndarray) - assert render_with_proba.shape == (*pages[0].dimensions, 3) + 
_assert_valid_render(render_with_proba, pages[0].dimensions) + + # Font size bounds are now part of the public signature (previously documented but missing) + render_sized = reconstitution.synthesize_kie_page(pages[0].export(), min_font_size=10, max_font_size=20) + _assert_valid_render(render_sized, pages[0].dimensions) + + +def test_synthesize_kie_page_rotated_prediction(caplog): + page = _mock_kie_pages()[0].export() + class_name = next(iter(page["predictions"])) + # Replace the first prediction geometry with a ~17 degree rotated polygon + page["predictions"][class_name][0]["geometry"] = [(0.2, 0.20), (0.6, 0.28), (0.58, 0.38), (0.18, 0.30)] + + reconstitution._warn_rotation_once.cache_clear() + render = reconstitution.synthesize_kie_page(page, draw_proba=True) + _assert_valid_render(render, (300, 200)) + + # The rotation warning must be emitted once, and only once, per process + reconstitution._warn_rotation_once.cache_clear() + caplog.clear() + with caplog.at_level("WARNING"): + reconstitution.synthesize_kie_page(page) + reconstitution.synthesize_kie_page(page) + warnings = [record for record in caplog.records if "rotation" in record.message.lower()] + assert len(warnings) == 1 + + +def test_synthesize_page_words_do_not_overlap(): + # Two adjacent words whose boxes are much narrower than the naive line-level font + # would require: the render must keep the gap between the boxes blank (regression test) + page = { + "dimensions": (300, 400), + "blocks": [ + { + "geometry": ((0.05, 0.4), (0.5, 0.48)), + "lines": [ + { + "geometry": ((0.05, 0.4), (0.5, 0.48)), + "words": [ + {"value": "Wideword", "confidence": 0.9, "geometry": ((0.05, 0.4), (0.3, 0.48))}, + {"value": "Next", "confidence": 0.9, "geometry": ((0.32, 0.4), (0.5, 0.48))}, + ], + } + ], + } + ], + } + render = reconstitution.synthesize_page(page) + _assert_valid_render(render, (300, 400)) + + # The vertical strip between the first word's box and the second word's box must be blank + gap = render[:, 
int(round(400 * 0.3)) + 1 : int(round(400 * 0.32)) - 1] + assert (gap == 255).all() + + +def test_synthesize_page_rotated_line(): + # A line whose words carry rotated 4-point polygons must render without error, + # follow the rotation (ink appears along the tilted baseline, not just the top band), + # and keep adjacent words from overlapping + import math + + h_px, w_px = 400, 600 + angle = math.radians(-18) + dx, dy = math.cos(angle), math.sin(angle) + px, py = -math.sin(angle), math.cos(angle) + height = 30 + + def rot_word(value, start_x, start_y, width): + x0, y0 = start_x, start_y + x1, y1 = x0 + width * dx, y0 + width * dy + x2, y2 = x1 + height * px, y1 + height * py + x3, y3 = x0 + height * px, y0 + height * py + poly = [(x / w_px, y / h_px) for x, y in ((x0, y0), (x1, y1), (x2, y2), (x3, y3))] + return {"value": value, "confidence": 0.9, "geometry": poly}, (x1, y1) + + w1, end1 = rot_word("Rotated", 60, 220, 150) + w2, _ = rot_word("baseline", end1[0] + 20 * dx, end1[1] + 20 * dy, 160) + page = { + "dimensions": (h_px, w_px), + "blocks": [ + { + "geometry": ((0, 0), (1, 1)), + "lines": [ + { + "geometry": [w1["geometry"][0], w2["geometry"][1], w2["geometry"][2], w1["geometry"][3]], + "words": [w1, w2], + }, + ], + } + ], + } + render = reconstitution.synthesize_page(page) + _assert_valid_render(render, (h_px, w_px)) + + # With an upward tilt, the second word's ink must sit clearly above the first word's start row; + # a horizontal per-bbox render would not place ink that high at those x-positions + right_half = render[: 220 - 2 * height, w_px // 2 :] + assert (right_half < 128).any()