From 4a716a2cf5950769d0eb1b6e5fd697af0c4105ca Mon Sep 17 00:00:00 2001 From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:01:10 -0400 Subject: [PATCH 1/3] parquet documentation and batch configuration --- docs/source/publishing/ogcapi-features.rst | 27 ++++++++++++++++-- pygeoapi/provider/parquet.py | 32 ++++++++++++++++++++-- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/docs/source/publishing/ogcapi-features.rst b/docs/source/publishing/ogcapi-features.rst index df40d27bd..2db2205b3 100644 --- a/docs/source/publishing/ogcapi-features.rst +++ b/docs/source/publishing/ogcapi-features.rst @@ -585,7 +585,7 @@ To publish a GeoParquet file (with a geometry column) the geopandas package is a - type: feature name: Parquet data: - source: ./tests/data/parquet/random.parquet + source: ./tests/data/parquet/naive/random.parquet id_field: id time_field: time x_field: @@ -595,11 +595,34 @@ To publish a GeoParquet file (with a geometry column) the geopandas package is a - minlat - maxlat -For GeoParquet data, the `x_field` and `y_field` must be specified in the provider definition, +For older versions of parquet data that don't comply to GeoParquet v1.1, the `x_field` and `y_field` must be specified in the provider definition, and they must be arrays of two column names that contain the x and y coordinates of the bounding box of each geometry. If the geometries in the data are all points, the `x_field` and `y_field` can be strings instead of arrays and refer to a single column each. +.. code-block:: yaml + + providers: + - type: feature + name: Parquet + id_field: id + data: + source: ./tests/data/parquet/geoparquet1.1/nyc_subset_overture.parquet + batch_size: 10000 + batch_readahead: 2 + + +For GeoParquet data which complies to spec version 1.1, all geometry metadata will be automatically +detected. 
+ +Note that for any version of parquet, you may optionally specify ``batch_size`` and ``batch_readahead`` in the ``data`` section of the parquet provider config. +``batch_size`` controls how many rows are fetched per batch. Large batch sizes speed up data processing, but add more I/O, increase latency when fetching data from an object store, and increase memory usage. If not defined it will +default to 20,000 rows. + +``batch_readahead`` controls how many batches are buffered in memory. If not specified it will default to 2. +Since OGC API Features payloads are often paginated and fairly small, it generally makes sense to specify a small number to avoid reading too many batches ahead of time, especially when fetching from an object store. + + .. _PostgreSQL: PostgreSQL diff --git a/pygeoapi/provider/parquet.py b/pygeoapi/provider/parquet.py index 8d69e9940..120a21182 100644 --- a/pygeoapi/provider/parquet.py +++ b/pygeoapi/provider/parquet.py @@ -108,7 +108,8 @@ def __init__(self, provider_def): name: Parquet data: source: s3://example.com/parquet_directory/ - + batch_size: 10000 + batch_readahead: 2 id_field: gml_id @@ -121,6 +122,23 @@ def __init__(self, provider_def): # Source url is required self.source = self.data.get('source') + # When iterating over a dataset, the batch size + # controls how many records are read at a time; + # a larger batch size can reduce latency for large + # requests the cost of memory and potentially overfetching + # the default batch size for pyarrow is 131_072 as specified + # by the following link: + # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scanner # noqa + # This can potentially be reduced if fetching the dataset from + # an object store + self.batch_size = self.data.get("batch_size", 20_000) + + # Batch readahead is the number of batches to prefetch + # this adds extra memory but can reduce latency for large + # or complicated queries; in an OGC API Features context, + 
# it generally makes sense to have some buffering but keep it + # low since most responses are small + self.batch_readahead = self.data.get('batch_readahead', 2) if not self.source: msg = 'Need explicit "source" attr in data' \ ' field of provider config' @@ -136,7 +154,8 @@ def __init__(self, provider_def): self.fs = None # Build pyarrow dataset pointing to the data - self.ds = pyarrow.dataset.dataset(self.source, filesystem=self.fs) + self.ds: pyarrow.dataset.Dataset = \ + pyarrow.dataset.dataset(self.source, filesystem=self.fs) if not self.id_field: LOGGER.info( @@ -231,6 +250,11 @@ def _read_parquet(self, return_scanner=False, **kwargs): :returns: generator of RecordBatch with the queried values """ scanner = self.ds.scanner( + batch_size=self.batch_size, + # default batch readahead is 16 which is generally + # far too high in a server context; we can safely set it + # to 2 which allows for queueing without excessive reads + batch_readahead=self.batch_readahead, use_threads=True, **kwargs ) @@ -573,7 +597,9 @@ def _response_feature_hits(self, filter): try: scanner = pyarrow.dataset.Scanner.from_dataset( - self.ds, filter=filter + self.ds, filter=filter, + batch_size=self.batch_size, + batch_readahead=self.batch_readahead ) return { 'type': 'FeatureCollection', From 9f9a10353c4c54efff9060f4912a0d468c726bc0 Mon Sep 17 00:00:00 2001 From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:06:56 -0400 Subject: [PATCH 2/3] single quotes --- pygeoapi/provider/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygeoapi/provider/parquet.py b/pygeoapi/provider/parquet.py index 120a21182..ade0e27f0 100644 --- a/pygeoapi/provider/parquet.py +++ b/pygeoapi/provider/parquet.py @@ -131,7 +131,7 @@ def __init__(self, provider_def): # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scanner # noqa # This can potentially be reduced if fetching the dataset from # an 
object store - self.batch_size = self.data.get("batch_size", 20_000) + self.batch_size = self.data.get('batch_size', 20_000) # Batch readahead is the number of batches to prefetch # this adds extra memory but can reduce latency for large From 917085653133a1363ad1efc50a33f6b6934f04fa Mon Sep 17 00:00:00 2001 From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:11:43 -0400 Subject: [PATCH 3/3] fix typo --- docs/source/publishing/ogcapi-features.rst | 2 +- pygeoapi/provider/parquet.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/publishing/ogcapi-features.rst b/docs/source/publishing/ogcapi-features.rst index 2db2205b3..8a38e0b98 100644 --- a/docs/source/publishing/ogcapi-features.rst +++ b/docs/source/publishing/ogcapi-features.rst @@ -616,7 +616,7 @@ For GeoParquet data which complies to spec version 1.1, all geometry metadata wi detected. Note that for any version of parquet, you may optionally specify ``batch_size`` and ``batch_readahead`` in the ``data`` section of the parquet provider config. -``batch_size`` controls how many rows are fetched per batch. Large batch sizes speed up data processing, but add more I/O, increase latency when fetching data from an object store, and increase memory usage. If not defined it will +``batch_size`` controls how many rows are fetched per batch. Large batch sizes speed up data processing, but add more I/O time, such as increased latency when fetching data from an object store, and increase memory usage. If not defined it will default to 20,000 rows. ``batch_readahead`` controls how many batches are buffered in memory. If not specified it will default to 2. 
diff --git a/pygeoapi/provider/parquet.py b/pygeoapi/provider/parquet.py index ade0e27f0..8413963e0 100644 --- a/pygeoapi/provider/parquet.py +++ b/pygeoapi/provider/parquet.py @@ -124,17 +124,17 @@ def __init__(self, provider_def): self.source = self.data.get('source') # When iterating over a dataset, the batch size # controls how many records are read at a time; - # a larger batch size can reduce latency for large - # requests the cost of memory and potentially overfetching - # the default batch size for pyarrow is 131_072 as specified - # by the following link: + # a larger batch size can reduce latency for large/complex + # requests at the cost of more memory usage + # and potentially overfetching; + # More information on batching can be found here: # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scanner # noqa - # This can potentially be reduced if fetching the dataset from - # an object store + # This value can be reduced to decrease network transfer + # if fetching data from an object store self.batch_size = self.data.get('batch_size', 20_000) - # Batch readahead is the number of batches to prefetch - # this adds extra memory but can reduce latency for large + # batch_readahead is the number of batches to prefetch; + # This adds extra memory but can reduce latency for large # or complicated queries; in an OGC API Features context, # it generally makes sense to have some buffering but keep it # low since most responses are small