From 4a716a2cf5950769d0eb1b6e5fd697af0c4105ca Mon Sep 17 00:00:00 2001 From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:01:10 -0400 Subject: [PATCH 1/3] parquet documentation and batch configuration --- docs/source/publishing/ogcapi-features.rst | 27 ++++++++++++++++-- pygeoapi/provider/parquet.py | 32 ++++++++++++++++++++-- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/docs/source/publishing/ogcapi-features.rst b/docs/source/publishing/ogcapi-features.rst index df40d27bd..2db2205b3 100644 --- a/docs/source/publishing/ogcapi-features.rst +++ b/docs/source/publishing/ogcapi-features.rst @@ -585,7 +585,7 @@ To publish a GeoParquet file (with a geometry column) the geopandas package is a - type: feature name: Parquet data: - source: ./tests/data/parquet/random.parquet + source: ./tests/data/parquet/naive/random.parquet id_field: id time_field: time x_field: @@ -595,11 +595,34 @@ To publish a GeoParquet file (with a geometry column) the geopandas package is a - minlat - maxlat -For GeoParquet data, the `x_field` and `y_field` must be specified in the provider definition, +For older versions of parquet data that don't comply to GeoParquet v1.1, the `x_field` and `y_field` must be specified in the provider definition, and they must be arrays of two column names that contain the x and y coordinates of the bounding box of each geometry. If the geometries in the data are all points, the `x_field` and `y_field` can be strings instead of arrays and refer to a single column each. +.. code-block:: yaml + + providers: + - type: feature + name: Parquet + id_field: id + data: + source: ./tests/data/parquet/geoparquet1.1/nyc_subset_overture.parquet + batch_size: 10000 + batch_readahead: 2 + + +For GeoParquet data which complies to spec version 1.1, all geometry metadata will be automatically +detected. 
+ +Note that for any version of parquet, you may optionally specify ``batch_size`` and ``batch_readahead`` in the ``data`` section of the parquet provider config. +``batch_size`` controls how many rows are fetched per batch. Large batch sizes speed up data processing, but add more I/O, increase latency when fetching data from an object store, and increase memory usage. If not defined it will +default to 20,000 rows. + +``batch_readahead`` controls how many batches are buffered in memory. If not specified it will default to 2. +Since OGC API Features payloads are often paginated and fairly small, it generally makes sense to specify a small number to avoid reading too many batches ahead of time, especially when fetching from an object store. + + .. _PostgreSQL: PostgreSQL diff --git a/pygeoapi/provider/parquet.py b/pygeoapi/provider/parquet.py index 8d69e9940..120a21182 100644 --- a/pygeoapi/provider/parquet.py +++ b/pygeoapi/provider/parquet.py @@ -108,7 +108,8 @@ def __init__(self, provider_def): name: Parquet data: source: s3://example.com/parquet_directory/ - + batch_size: 10000 + batch_readahead: 2 id_field: gml_id @@ -121,6 +122,23 @@ def __init__(self, provider_def): # Source url is required self.source = self.data.get('source') + # When iterating over a dataset, the batch size + # controls how many records are read at a time; + # a larger batch size can reduce latency for large + # requests the cost of memory and potentially overfetching + # the default batch size for pyarrow is 131_072 as specified + # by the following link: + # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scanner # noqa + # This can potentially be reduced if fetching the dataset from + # an object store + self.batch_size = self.data.get("batch_size", 20_000) + + # Batch readahead is the number of batches to prefetch + # this adds extra memory but can reduce latency for large + # or complicated queries; in an OGC API Features context, + 
# it generally makes sense to have some buffering but keep it + # low since most responses are small + self.batch_readahead = self.data.get('batch_readahead', 2) if not self.source: msg = 'Need explicit "source" attr in data' \ ' field of provider config' @@ -136,7 +154,8 @@ def __init__(self, provider_def): self.fs = None # Build pyarrow dataset pointing to the data - self.ds = pyarrow.dataset.dataset(self.source, filesystem=self.fs) + self.ds: pyarrow.dataset.Dataset = \ + pyarrow.dataset.dataset(self.source, filesystem=self.fs) if not self.id_field: LOGGER.info( @@ -231,6 +250,11 @@ def _read_parquet(self, return_scanner=False, **kwargs): :returns: generator of RecordBatch with the queried values """ scanner = self.ds.scanner( + batch_size=self.batch_size, + # default batch readahead is 16 which is generally + # far too high in a server context; we can safely set it + # to 2 which allows for queueing without excessive reads + batch_readahead=self.batch_readahead, use_threads=True, **kwargs ) @@ -573,7 +597,9 @@ def _response_feature_hits(self, filter): try: scanner = pyarrow.dataset.Scanner.from_dataset( - self.ds, filter=filter + self.ds, filter=filter, + batch_size=self.batch_size, + batch_readahead=self.batch_readahead ) return { 'type': 'FeatureCollection', From 9f9a10353c4c54efff9060f4912a0d468c726bc0 Mon Sep 17 00:00:00 2001 From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:06:56 -0400 Subject: [PATCH 2/3] single quotes --- pygeoapi/provider/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygeoapi/provider/parquet.py b/pygeoapi/provider/parquet.py index 120a21182..ade0e27f0 100644 --- a/pygeoapi/provider/parquet.py +++ b/pygeoapi/provider/parquet.py @@ -131,7 +131,7 @@ def __init__(self, provider_def): # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scanner # noqa # This can potentially be reduced if fetching the dataset from # an 
object store - self.batch_size = self.data.get("batch_size", 20_000) + self.batch_size = self.data.get('batch_size', 20_000) # Batch readahead is the number of batches to prefetch # this adds extra memory but can reduce latency for large From 917085653133a1363ad1efc50a33f6b6934f04fa Mon Sep 17 00:00:00 2001 From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:11:43 -0400 Subject: [PATCH 3/3] fix typo --- docs/source/publishing/ogcapi-features.rst | 2 +- pygeoapi/provider/parquet.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/publishing/ogcapi-features.rst b/docs/source/publishing/ogcapi-features.rst index 2db2205b3..8a38e0b98 100644 --- a/docs/source/publishing/ogcapi-features.rst +++ b/docs/source/publishing/ogcapi-features.rst @@ -616,7 +616,7 @@ For GeoParquet data which complies to spec version 1.1, all geometry metadata wi detected. Note that for any version of parquet, you may optionally specify ``batch_size`` and ``batch_readahead`` in the ``data`` section of the parquet provider config. -``batch_size`` controls how many rows are fetched per batch. Large batch sizes speed up data processing, but add more I/O, increase latency when fetching data from an object store, and increase memory usage. If not defined it will +``batch_size`` controls how many rows are fetched per batch. Large batch sizes speed up data processing, but add more I/O time, such as increased latency when fetching data from an object store, and increase memory usage. If not defined it will default to 20,000 rows. ``batch_readahead`` controls how many batches are buffered in memory. If not specified it will default to 2. 
diff --git a/pygeoapi/provider/parquet.py b/pygeoapi/provider/parquet.py index ade0e27f0..8413963e0 100644 --- a/pygeoapi/provider/parquet.py +++ b/pygeoapi/provider/parquet.py @@ -124,17 +124,17 @@ def __init__(self, provider_def): self.source = self.data.get('source') # When iterating over a dataset, the batch size # controls how many records are read at a time; - # a larger batch size can reduce latency for large - # requests the cost of memory and potentially overfetching - # the default batch size for pyarrow is 131_072 as specified - # by the following link: + # a larger batch size can reduce latency for large/complex + # requests at the cost of more memory usage + # and potentially overfetching; + # More information on batching can be found here: # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scanner # noqa - # This can potentially be reduced if fetching the dataset from - # an object store + # This value can be reduced to decrease network transfer + # if fetching data from an object store self.batch_size = self.data.get('batch_size', 20_000) - # Batch readahead is the number of batches to prefetch - # this adds extra memory but can reduce latency for large + # batch_readahead is the number of batches to prefetch; + # This adds extra memory but can reduce latency for large # or complicated queries; in an OGC API Features context, # it generally makes sense to have some buffering but keep it # low since most responses are small