-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-49376: [Python][Parquet] Add ability to write Bloom filters from pyarrow #49377
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
a9ca4d7
c0fb063
176fe77
c74fa69
c67af90
1b39f04
965729a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -51,6 +51,10 @@ cimport cpython as cp | |
| _DEFAULT_ROW_GROUP_SIZE = 1024*1024 | ||
| _MAX_ROW_GROUP_SIZE = 64*1024*1024 | ||
|
|
||
| # from definition of BloomFilterOptions struct | ||
| _DEFAULT_BLOOM_FILTER_NDV = 1024*1024 | ||
| _DEFAULT_BLOOM_FILTER_FPP = 0.05 | ||
|
|
||
|
|
||
| cdef Type _unwrap_list_type(obj) except *: | ||
| if obj is ListType: | ||
|
|
@@ -1992,13 +1996,15 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( | |
| write_page_checksum=False, | ||
| sorting_columns=None, | ||
| store_decimal_as_integer=False, | ||
| use_content_defined_chunking=False) except *: | ||
| use_content_defined_chunking=False, | ||
| bloom_filter_options=None) except *: | ||
|
|
||
| """General writer properties""" | ||
| cdef: | ||
| shared_ptr[WriterProperties] properties | ||
| WriterProperties.Builder props | ||
| CdcOptions cdc_options | ||
| BloomFilterOptions bloom_opts | ||
|
|
||
| # data_page_version | ||
|
|
||
|
|
@@ -2122,6 +2128,48 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( | |
| raise TypeError( | ||
| "'column_encoding' should be a dictionary or a string") | ||
|
|
||
| # bloom filters | ||
| if bloom_filter_options is not None: | ||
| if isinstance(bloom_filter_options, dict): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we factor this out into a helper function? This function is becoming much too large IMHO. |
||
| # for each entry in bloom_filter_options, {"path": {"ndv": ndv, "fpp": fpp}} | ||
| # convert (ndv,fpp) to BloomFilterOptions struct and pass to props | ||
| for column, _bloom_opts in bloom_filter_options.items(): | ||
| # set defaults | ||
| bloom_opts.ndv = _DEFAULT_BLOOM_FILTER_NDV | ||
| bloom_opts.fpp = _DEFAULT_BLOOM_FILTER_FPP | ||
| if isinstance(_bloom_opts, dict): | ||
| if "ndv" in _bloom_opts: | ||
| ndv = _bloom_opts["ndv"] | ||
| if isinstance(ndv, int): | ||
| if ndv < 0: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ndv <= 0?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure if NDV can be 0 or not...I can't seem to find validation in either the C++ nor rust implementations. This test just ensures it's a positive value. Honestly, I don't see why you'd need a bloom filter for a column with no values, so I'm fine following your suggestion. |
||
| raise ValueError( | ||
| f"'ndv' for column '{column}' must be positive, got {ndv}") | ||
| bloom_opts.ndv = ndv | ||
| else: | ||
| raise TypeError( | ||
| f"'ndv' for column '{column}' must be an int") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we notice they're in bloom filter options? |
||
| if "fpp" in _bloom_opts: | ||
| fpp = _bloom_opts["fpp"] | ||
| if isinstance(fpp, float): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know whether casting to
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so. We don't want to accept strings, for example. |
||
| if fpp <= 0.0 or fpp >= 1.0: | ||
| raise ValueError( | ||
| f"'fpp' for column '{column}' must be in (0.0, 1,0), got {fpp}") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 1,0 typo here? |
||
| bloom_opts.fpp = fpp | ||
| else: | ||
| raise TypeError( | ||
| f"'fpp' for column '{column}' must be a float") | ||
| elif isinstance(_bloom_opts, bool): | ||
| if not _bloom_opts: | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| props.disable_bloom_filter(tobytes(column)) | ||
| continue | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand that bloom filter is disabled by default but should we be explicit here calling
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I agree it would be better to be explicit here. Thanks! |
||
| else: | ||
| raise TypeError( | ||
| f"'bloom_filter_options:{column}' must be a boolean or a dictionary") | ||
|
|
||
| props.enable_bloom_filter(tobytes(column), bloom_opts) | ||
| else: | ||
| raise TypeError("'bloom_filter_options' must be a dictionary") | ||
|
|
||
| # size limits | ||
| if data_page_size is not None: | ||
| props.data_pagesize(data_page_size) | ||
|
|
@@ -2317,7 +2365,8 @@ cdef class ParquetWriter(_Weakrefable): | |
| sorting_columns=None, | ||
| store_decimal_as_integer=False, | ||
| use_content_defined_chunking=False, | ||
| write_time_adjusted_to_utc=False): | ||
| write_time_adjusted_to_utc=False, | ||
| bloom_filter_options=None): | ||
| cdef: | ||
| shared_ptr[WriterProperties] properties | ||
| shared_ptr[ArrowWriterProperties] arrow_properties | ||
|
|
@@ -2353,7 +2402,8 @@ cdef class ParquetWriter(_Weakrefable): | |
| write_page_checksum=write_page_checksum, | ||
| sorting_columns=sorting_columns, | ||
| store_decimal_as_integer=store_decimal_as_integer, | ||
| use_content_defined_chunking=use_content_defined_chunking | ||
| use_content_defined_chunking=use_content_defined_chunking, | ||
| bloom_filter_options=bloom_filter_options | ||
| ) | ||
| arrow_properties = _create_arrow_writer_properties( | ||
| use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -951,6 +951,30 @@ def _sanitize_table(table, new_schema, flavor): | |
| are expressed in reference to midnight in the UTC timezone. | ||
| If False (the default), the TIME columns are assumed to be expressed | ||
| in reference to midnight in an unknown, presumably local, timezone. | ||
| bloom_filter_options : dict, default None | ||
| Create Bloom filters for the columns specified by the provided `dict`. | ||
|
|
||
| Bloom filters can be configured with two parameters: number of distinct values | ||
| (NDV), and false-positive probability (FPP). | ||
|
|
||
| Bloom filters are most effective for high-cardinality columns. A good default | ||
| is to set NDV equal to the number of rows. Lower values reduce disk usage but | ||
| may not be worthwhile for very small NDVs. Increasing NDV (without increasing FPP) | ||
| increases disk and memory usage. | ||
|
|
||
| Lower FPP values require more disk and memory space. For a fixed NDV, the | ||
| space requirement grows roughly proportional to log(1/FPP). Recommended | ||
| values are 0.1, 0.05, or 0.01. Very small values are counterproductive as | ||
| the bitset may exceed the size of the actual data. Set NDV appropriately | ||
| to minimize space usage. | ||
|
|
||
| The keys of the `dict` are column paths. For each path, the value can be either: | ||
|
|
||
| - A boolean, with ``True`` indicating that a Bloom filter should be produced with | ||
| the default values of `NDV=1048576` and `FPP=0.05`. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lower case? |
||
| - A dictionary, with keys `ndv` and `fpp`. `ndv` must be a positive integer, and | ||
| `fpp` must be a float between 0.0 and 1.0. Default values will be used for any | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are missing `ndv` and `fpp` keys treated as the defaults 1048576/0.05?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. I've reworked the docs to hopefully make this clearer. |
||
| missing keys. | ||
| """ | ||
|
|
||
| _parquet_writer_example_doc = """\ | ||
|
|
@@ -1980,6 +2004,7 @@ def write_table(table, where, row_group_size=None, version='2.6', | |
| store_decimal_as_integer=False, | ||
| write_time_adjusted_to_utc=False, | ||
| max_rows_per_page=None, | ||
| bloom_filter_options=None, | ||
| **kwargs): | ||
| # Implementor's note: when adding keywords here / updating defaults, also | ||
| # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions | ||
|
|
@@ -2013,6 +2038,7 @@ def write_table(table, where, row_group_size=None, version='2.6', | |
| store_decimal_as_integer=store_decimal_as_integer, | ||
| write_time_adjusted_to_utc=write_time_adjusted_to_utc, | ||
| max_rows_per_page=max_rows_per_page, | ||
| bloom_filter_options=bloom_filter_options, | ||
| **kwargs) as writer: | ||
| writer.write_table(table, row_group_size=row_group_size) | ||
| except Exception: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.