mindee · felixdittrich92 · May 22, 2026 · May 13, 2026 · May 13, 2026 · May 21, 2026
diff --git a/.github/workflows/references.yml b/.github/workflows/references.yml
@@ -251,3 +251,96 @@ jobs:
           pip install -e .[viz,html] --upgrade
       - name: Benchmark latency
         run: python references/detection/latency.py db_mobilenet_v3_large --it 5 --size 512
+
+
+  train-layout-analysis:  # smoke test: runs references/layout/train.py for one epoch on the toy layout set
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # let the remaining matrix entries finish even if one fails
+      matrix:
+        os: [ubuntu-latest]
+        python: ["3.10"]  # quoted so YAML keeps the version a string instead of the float 3.1
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
+          architecture: x64
+      - name: Cache python modules
+        uses: actions/cache@v5
+        with:  # pip download cache; key hashes both requirement files, restore-keys falls back to a requirements-pt.txt-only prefix
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-${{ hashFiles('references/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .[viz,html] --upgrade
+          pip install -r references/requirements.txt
+      - name: Download and extract toy set  # archive is unpacked into ./layout_set
+        run: |
+          wget https://github.com/mindee/doctr/releases/download/v1.0.1/toy_layout_set-d4a8d4c9.zip
+          sudo apt-get update && sudo apt-get install unzip -y
+          unzip toy_layout_set-d4a8d4c9.zip -d layout_set
+      - name: Train for a short epoch  # the same toy set is passed as both train and validation path
+        run: python references/layout/train.py lw_detr_s --train_path ./layout_set --val_path ./layout_set -b 2 --epochs 1
+
+  evaluate-layout-analysis:  # smoke test: runs references/layout/evaluate.py for lw_detr_s on the toy layout set
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # let the remaining matrix entries finish even if one fails
+      matrix:
+        os: [ubuntu-latest]
+        python: ["3.10"]  # quoted so YAML keeps the version a string instead of the float 3.1
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
+          architecture: x64
+      - name: Cache python modules
+        uses: actions/cache@v5
+        with:  # key also hashes references/requirements.txt because this job installs it below
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-${{ hashFiles('references/requirements.txt') }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .[viz,html] --upgrade
+          pip install -r references/requirements.txt
+      - name: Download and extract toy set  # archive is unpacked into ./layout_set
+        run: |
+          wget https://github.com/mindee/doctr/releases/download/v1.0.1/toy_layout_set-d4a8d4c9.zip
+          sudo apt-get update && sudo apt-get install unzip -y
+          unzip toy_layout_set-d4a8d4c9.zip -d layout_set
+      - name: Evaluate layout analysis
+        run: python references/layout/evaluate.py lw_detr_s ./layout_set
+
+  latency-layout-analysis:  # smoke test: runs the layout latency reference script for lw_detr_s
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # let the remaining matrix entries finish even if one fails
+      matrix:
+        os: [ubuntu-latest]
+        python: ["3.10"]  # quoted so YAML keeps the version a string instead of the float 3.1
+    steps:
+      - uses: actions/checkout@v6
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
+          architecture: x64
+      - name: Cache python modules
+        uses: actions/cache@v5
+        with:  # pip download cache keyed on requirements-pt.txt only; this job does not install references/requirements.txt
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .[viz,html] --upgrade
+      - name: Benchmark latency
+        run: python references/layout/latency.py lw_detr_s --it 5 --size 512
diff --git a/docs/source/using_doctr/custom_models_training.rst b/docs/source/using_doctr/custom_models_training.rst
@@ -6,6 +6,7 @@ For details on the training process and the necessary data and data format, refe
 
 - `detection <https://github.com/mindee/doctr/tree/main/references/detection#readme>`_
 - `recognition <https://github.com/mindee/doctr/tree/main/references/recognition#readme>`_
+- `layout <https://github.com/mindee/doctr/tree/main/references/layout#readme>`_
 
 If you’re looking for a lightweight yet efficient tool to annotate small amounts of data, especially tailored for docTR,
 check out the `docTR Labeling Tool <https://github.com/text2knowledge/docTR-Labeler>`_.
@@ -52,6 +53,20 @@ Load a custom recognition model trained on another vocabulary as the default one
 
     predictor = ocr_predictor(det_arch='linknet_resnet18', reco_arch=reco_model, pretrained=True)
 
+
+Load a custom layout analysis model trained on another set of classes as the default one:
+
+.. code:: python3
+
+    import torch
+    from doctr.models import layout_predictor, lw_detr_s
+    from doctr.datasets import VOCABS
+
+    layout_model = lw_detr_s(pretrained=False, class_names=["class_name_1", "class_name_2", ...])
+    layout_model.from_pretrained('<path_to_pt>')
+
+    predictor = layout_predictor(layout_arch=layout_model, pretrained=True)
+
 Load a custom trained KIE detection model:
 
 .. code:: python3

diff --git a/docs/source/using_doctr/using_models.rst b/docs/source/using_doctr/using_models.rst
@@ -174,6 +174,66 @@ Recognition predictors
     out = model([dummy_img])
 
 
+Layout Analysis
+---------------
+
+The task consists of localizing and classifying visual elements in a given image.
+This is a more general task than text detection, as it can be used to detect and classify any type of visual element in a document, such as tables, figures, headers, footers, etc.
+Our latest layout models work with rotated and skewed documents!
+
+Available architectures
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The following architectures are currently supported:
+
+* :py:meth:`lw_detr_s <doctr.models.layout.lw_detr_s>`
+* :py:meth:`lw_detr_m <doctr.models.layout.lw_detr_m>`
+
+For a comprehensive comparison, we have compiled a detailed benchmark:
+
++--------------------------------------------------+-----------------+---------------+------------------+-------------+--------------+--------------------+
+|                                                  |                 |               |                  |             |              |                    |
++==================================================+=================+===============+==================+=============+==============+====================+
+| **Architecture**                                 | **Input shape** | **# params**  | **mAP@[.5:.95]** | **AP@[.5]** | **AP@[.75]** | **sec/it (B: 1)**  |
++--------------------------------------------------+-----------------+---------------+------------------+-------------+--------------+--------------------+
+| lw_detr_s                                        | (1024, 1024, 3) | 15.1 M        |                  |             |              | 0.5                |
++--------------------------------------------------+-----------------+---------------+------------------+-------------+--------------+--------------------+
+| lw_detr_m                                        | (1024, 1024, 3) | 29.5 M        |                  |             |              | 0.7                |
++--------------------------------------------------+-----------------+---------------+------------------+-------------+--------------+--------------------+
+
+
+Explanations about the metrics being used are available in :ref:`metrics`.
+
+Seconds per iteration (with a batch size of 1) is computed after a warmup phase of 100 tensors, by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz`.
+
+
+Layout predictors
+^^^^^^^^^^^^^^^^^
+
+:py:meth:`layout_predictor <doctr.models.layout.layout_predictor>` wraps your layout model so that it can be used seamlessly with your favorite deep learning framework.
+
+.. code:: python3
+
+    import numpy as np
+    from doctr.models import layout_predictor
+    model = layout_predictor('lw_detr_s')
+    dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
+    out = model([dummy_img])
+
+You can pass specific boolean arguments to the predictor:
+* `pretrained`: if you want to use a model that has been pretrained on a specific dataset, setting `pretrained=True` will load the corresponding weights. With `pretrained=False`, which is the default, the model is randomly initialized and will produce no/useless results.
+* `assume_straight_pages`: if you work with straight documents only, it will fit straight bounding boxes to the text areas.
+* `preserve_aspect_ratio`: if you want to preserve the aspect ratio of your documents while resizing before sending them to the model.
+* `symmetric_pad`: if you choose to preserve the aspect ratio, it will pad the image symmetrically and not from the bottom-right.
+
+For instance, this snippet instantiates a layout predictor able to detect layout elements on rotated documents while preserving the aspect ratio:
+
+.. code:: python3
+
+    from doctr.models import layout_predictor
+    predictor = layout_predictor('lw_detr_s', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True)
+
+
 End-to-End OCR
 --------------
 

diff --git a/doctr/datasets/coco_text.py b/doctr/datasets/coco_text.py
@@ -27,10 +27,12 @@ class COCOTEXT(AbstractDataset):
     >>> from doctr.datasets import COCOTEXT
     >>> train_set = COCOTEXT(train=True, img_folder="/path/to/coco_text/train2014/",
     >>>                     label_path="/path/to/coco_text/cocotext.v2.json")
-    >>> img, target = train_set[0]
+    >>> sample = train_set[0]
+    >>> img, target = sample.image, sample.target
     >>> test_set = COCOTEXT(train=False, img_folder="/path/to/coco_text/train2014/",
     >>> label_path = "/path/to/coco_text/cocotext.v2.json")
-    >>> img, target = test_set[0]
+    >>> sample = test_set[0]
+    >>> img, target = sample.image, sample.target
 
     Args:
         img_folder: folder with all the images of the dataset

diff --git a/doctr/datasets/cord.py b/doctr/datasets/cord.py
@@ -26,7 +26,8 @@ class CORD(VisionDataset):
 
     >>> from doctr.datasets import CORD
     >>> train_set = CORD(train=True, download=True)
-    >>> img, target = train_set[0]
+    >>> sample = train_set[0]
+    >>> img, target = sample.image, sample.target
 
     Args:
         train: whether the subset should be the training one

diff --git a/doctr/datasets/datasets/base.py b/doctr/datasets/datasets/base.py
@@ -9,12 +9,8 @@
 from pathlib import Path
 from typing import Any
 
-import numpy as np
-
 from doctr.io.image import get_img_shape
-from doctr.utils.data import download_from_url
-
-from ...models.utils import _copy_tensor
+from doctr.utils import Sample, download_from_url
 
 __all__ = ["_AbstractDataset", "_VisionDataset"]
 
@@ -26,8 +22,8 @@
     def __init__(
         self,
         root: str | Path,
-        img_transforms: Callable[[Any], Any] | None = None,
-        sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
+        img_transforms: Callable[[Sample], Sample] | None = None,
+        sample_transforms: Callable[[Sample], Sample] | None = None,
         pre_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
     ) -> None:
         if not Path(root).is_dir():
@@ -45,32 +41,24 @@
     def _read_sample(self, index: int) -> tuple[Any, Any]:
         raise NotImplementedError
 
-    def __getitem__(self, index: int) -> tuple[Any, Any]:
+    def __getitem__(self, index: int) -> Sample:
         # Read image
         img, target = self._read_sample(index)
+        mask = None
+
         # Pre-transforms (format conversion at run-time etc.)
         if self._pre_transforms is not None:
             img, target = self._pre_transforms(img, target)
 
+        sample = Sample(image=img, mask=mask, target=target)
+
         if self.img_transforms is not None:
-            # typing issue cf. https://github.com/python/mypy/issues/5485
-            img = self.img_transforms(img)
+            sample = self.img_transforms(sample)
 
         if self.sample_transforms is not None:
-            # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
-            if (
-                isinstance(target, dict)
-                and all(isinstance(item, np.ndarray) for item in target.values())
-                and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
-            ):
-                img_transformed = _copy_tensor(img)
-                for class_name, bboxes in target.items():
-                    img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
-                img = img_transformed
-            else:
-                img, target = self.sample_transforms(img, target)
-
-        return img, target
+            sample = self.sample_transforms(sample)
+
+        return sample
 
     def extra_repr(self) -> str:
         return ""

diff --git a/doctr/datasets/datasets/pytorch.py b/doctr/datasets/datasets/pytorch.py
@@ -11,6 +11,7 @@
 import torch
 
 from doctr.io import read_img_as_tensor, tensor_from_numpy
+from doctr.utils import Sample
 
 from .base import _AbstractDataset, _VisionDataset
 
@@ -48,11 +49,20 @@ def _read_sample(self, index: int) -> tuple[torch.Tensor, Any]:
         return img, deepcopy(target)
 
     @staticmethod
-    def collate_fn(samples: list[tuple[torch.Tensor, Any]]) -> tuple[torch.Tensor, list[Any]]:
-        images, targets = zip(*samples)
-        images = torch.stack(images, dim=0)  # type: ignore[assignment]
+    def collate_fn(
+        samples: list[Sample],
+    ) -> tuple[torch.Tensor, list[Any]] | tuple[tuple[torch.Tensor, torch.Tensor], list[Any]]:
+        _images = [s.image for s in samples]
+        targets = [s.target for s in samples]
 
-        return images, list(targets)  # type: ignore[return-value]
+        _masks = [s.mask for s in samples if s.mask is not None]
+
+        images = torch.stack(_images, dim=0)
+        if _masks:
+            masks = torch.stack(_masks, dim=0)
+            return (images, masks), targets
+
+        return images, targets
 
 
 class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101

diff --git a/doctr/datasets/detection.py b/doctr/datasets/detection.py
@@ -23,7 +23,8 @@ class DetectionDataset(AbstractDataset):
     >>> from doctr.datasets import DetectionDataset
     >>> train_set = DetectionDataset(img_folder="/path/to/images",
     >>>                              label_path="/path/to/labels.json")
-    >>> img, target = train_set[0]
+    >>> sample = train_set[0]
+    >>> img, target = sample.image, sample.target
 
     Args:
         img_folder: folder with all the images of the dataset

diff --git a/doctr/datasets/doc_artefacts.py b/doctr/datasets/doc_artefacts.py
@@ -23,7 +23,8 @@ class DocArtefacts(VisionDataset):
 
     >>> from doctr.datasets import DocArtefacts
     >>> train_set = DocArtefacts(train=True, download=True)
-    >>> img, target = train_set[0]
+    >>> sample = train_set[0]
+    >>> img, target = sample.image, sample.target
 
     Args:
         train: whether the subset should be the training one

diff --git a/doctr/datasets/funsd.py b/doctr/datasets/funsd.py
@@ -26,7 +26,8 @@ class FUNSD(VisionDataset):
 
     >>> from doctr.datasets import FUNSD
     >>> train_set = FUNSD(train=True, download=True)
-    >>> img, target = train_set[0]
+    >>> sample = train_set[0]
+    >>> img, target = sample.image, sample.target
 
     Args:
         train: whether the subset should be the training one

diff --git a/doctr/datasets/generator/base.py b/doctr/datasets/generator/base.py
@@ -10,6 +10,7 @@
 from PIL import Image, ImageDraw
 
 from doctr.io.image import tensor_from_pil
+from doctr.utils import Sample
 from doctr.utils.fonts import get_font
 
 from ..datasets import AbstractDataset
@@ -62,7 +63,7 @@ def __init__(
         cache_samples: bool = False,
         font_family: str | list[str] | None = None,
         img_transforms: Callable[[Any], Any] | None = None,
-        sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
+        sample_transforms: Callable[[Sample], Sample] | None = None,
     ) -> None:
         self.vocab = vocab
         self._num_samples = num_samples
@@ -111,7 +112,7 @@ def __init__(
         cache_samples: bool = False,
         font_family: str | list[str] | None = None,
         img_transforms: Callable[[Any], Any] | None = None,
-        sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
+        sample_transforms: Callable[[Sample], Sample] | None = None,
     ) -> None:
         self.vocab = vocab
         self.wordlen_range = (min_chars, max_chars)