From 43fde4500a2e9fb7ccace21b36e388398a854629 Mon Sep 17 00:00:00 2001 From: ChiLin Chiu Date: Sat, 21 Feb 2026 18:34:31 +0800 Subject: [PATCH 1/2] doc: add pyarrow.parquet.filters_to_expression example Signed-off-by: ChiLin Chiu --- docs/source/python/dataset.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index 4e18ea0a51c..fca34ea852d 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -569,6 +569,28 @@ calculate the average of a column without loading the entire column into memory: ... count += batch.num_rows >>> mean_a = col2_sum/count +The ``filter`` argument of :meth:`Dataset.to_batches` (and :meth:`~Dataset.to_table`) +expects a boolean :class:`~pyarrow.dataset.Expression`, which can be constructed using +:func:`pyarrow.dataset.field` and its operator overloads. However, if you already have +filters in the DNF (Disjunctive Normal Form) list-of-tuples format accepted by +:class:`pyarrow.parquet.ParquetDataset`, you can convert them to an ``Expression`` +using :func:`pyarrow.parquet.filters_to_expression`: + +.. code-block:: python + + >>> import pyarrow.parquet as pq + >>> import pyarrow.compute as pc + >>> filters = [("a", ">=", 5), ("c", "==", 2)] + >>> filter_expr = pq.filters_to_expression(filters) + >>> filter_expr + <pyarrow.compute.Expression ((a >= 5) and (c == 2))> + >>> a_sum = 0 + >>> for batch in dataset.to_batches(columns=["a"], filter=filter_expr): + ... if batch.num_rows: + ... 
a_sum += pc.sum(batch.column("a")).as_py() + >>> a_sum + 21 + Customizing the batch size ~~~~~~~~~~~~~~~~~~~~~~~~~~ From 6e04566c4b421aee4c6c15130be9923b9fe50622 Mon Sep 17 00:00:00 2001 From: ChiLin Chiu Date: Sun, 22 Feb 2026 13:05:43 +0800 Subject: [PATCH 2/2] doc: fix doctest error Signed-off-by: ChiLin Chiu --- docs/source/python/dataset.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index fca34ea852d..65dfda7d4a7 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -580,6 +580,7 @@ using :func:`pyarrow.parquet.filters_to_expression`: >>> import pyarrow.parquet as pq >>> import pyarrow.compute as pc + >>> dataset = ds.dataset(base / "parquet_dataset", format="parquet") >>> filters = [("a", ">=", 5), ("c", "==", 2)] >>> filter_expr = pq.filters_to_expression(filters) >>> filter_expr