The writer's data bytes (the RLE-encoded indices) and the dictionary + * page are returned separately so both pieces can be measured or fed to a + * decoder symmetrically. The dictionary page buffer is copied so it remains + * valid after the writer's allocator is released. + * + *
The writer is closed via {@code toDictPageAndClose()}; callers must not + * call {@link DictionaryValuesWriter#close()} again afterwards. + */ + static EncodedDictionary drainDictionary(DictionaryValuesWriter writer) throws IOException { + byte[] dictData = writer.getBytes().toByteArray(); + DictionaryPage rawPage = writer.toDictPageAndClose(); + DictionaryPage dictPage = rawPage == null ? null : rawPage.copy(); + return new EncodedDictionary(dictData, dictPage); + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryDecodingBenchmark.java new file mode 100644 index 0000000000..8b05893933 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryDecodingBenchmark.java @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.values.dictionary.DictionaryValuesReader; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.dictionary.PlainValuesDictionary; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Decoding-level micro-benchmarks for the DICTIONARY encoding across all Parquet + * types that support it: {@code INT32}, {@code INT64}, {@code FLOAT}, + * {@code DOUBLE}, {@code BINARY}, and {@code FIXED_LEN_BYTE_ARRAY}. + * Encoding benchmarks live in {@link DictionaryEncodingBenchmark}. + * + *
Each type group uses its own inner {@link State} class with independent + * {@code @Param} dimensions to avoid JMH cross-product pollution: + *
Decode benchmarks measure the {@link DictionaryValuesReader} lookup path. + * Each invocation decodes {@value #VALUE_COUNT} values; throughput is reported + * per-value via {@link OperationsPerInvocation}. + */ +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +public class DictionaryDecodingBenchmark { + + static final int VALUE_COUNT = 100_000; + private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024; + + // ==== Fixed-width numeric types (parameterised by dataPattern) ==== + + @State(Scope.Thread) + public static class NumericState { + @Param({"LOW_CARDINALITY", "HIGH_CARDINALITY"}) + public String dataPattern; + + // Pre-encoded dictionary pages + byte[] intDictDataEncoded; + Dictionary intDictionary; + boolean intDictAvailable; + + byte[] longDictDataEncoded; + Dictionary longDictionary; + boolean longDictAvailable; + + byte[] floatDictDataEncoded; + Dictionary floatDictionary; + boolean floatDictAvailable; + + byte[] doubleDictDataEncoded; + Dictionary doubleDictionary; + boolean doubleDictAvailable; + + @Setup(Level.Trial) + public void setup() throws IOException { + int distinct = "LOW_CARDINALITY".equals(dataPattern) + ? TestDataFactory.LOW_CARDINALITY_DISTINCT + : 0; // 0 = all unique for HIGH_CARDINALITY + + long seed = TestDataFactory.DEFAULT_SEED; + + int[] intData; + long[] longData; + float[] floatData; + double[] doubleData; + + if (distinct > 0) { + intData = TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, distinct, seed); + longData = TestDataFactory.generateLowCardinalityLongs(VALUE_COUNT, distinct, seed); + floatData = TestDataFactory.generateLowCardinalityFloats(VALUE_COUNT, distinct, seed); + doubleData = TestDataFactory.generateLowCardinalityDoubles(VALUE_COUNT, distinct, seed); + } else { + intData = TestDataFactory.generateRandomInts(VALUE_COUNT, seed); + longData = TestDataFactory.generateRandomLongs(VALUE_COUNT, seed); + floatData = TestDataFactory.generateRandomFloats(VALUE_COUNT, seed); + doubleData = TestDataFactory.generateRandomDoubles(VALUE_COUNT, seed); + } + + setupIntDict(intData); + setupLongDict(longData); + setupFloatDict(floatData); + setupDoubleDict(doubleData); + } + + private void setupIntDict(int[] data) throws IOException { + DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, + Encoding.PLAIN_DICTIONARY, + Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (int v : data) { + w.writeInteger(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + intDictDataEncoded = enc.dictData; + intDictAvailable = !enc.fellBackToPlain(); + if (intDictAvailable) { + intDictionary = new PlainValuesDictionary.PlainIntegerDictionary(enc.dictPage); + } + } + + private void setupLongDict(long[] data) throws IOException { + DictionaryValuesWriter.PlainLongDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainLongDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, + Encoding.PLAIN_DICTIONARY, + Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (long v : data) { + w.writeLong(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + longDictDataEncoded = enc.dictData; + longDictAvailable = !enc.fellBackToPlain(); + if (longDictAvailable) { + longDictionary = new PlainValuesDictionary.PlainLongDictionary(enc.dictPage); + } + } + + private void setupFloatDict(float[] data) throws IOException { + DictionaryValuesWriter.PlainFloatDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, + Encoding.PLAIN_DICTIONARY, + Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (float v : data) { + w.writeFloat(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + floatDictDataEncoded = enc.dictData; + floatDictAvailable = !enc.fellBackToPlain(); + if (floatDictAvailable) { + floatDictionary = new PlainValuesDictionary.PlainFloatDictionary(enc.dictPage); + } + } + + private void setupDoubleDict(double[] data) throws IOException { + DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, + Encoding.PLAIN_DICTIONARY, + Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (double v : data) { + w.writeDouble(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + doubleDictDataEncoded = enc.dictData; + doubleDictAvailable = !enc.fellBackToPlain(); + if (doubleDictAvailable) { + doubleDictionary = new PlainValuesDictionary.PlainDoubleDictionary(enc.dictPage); + } + } + } + + // ---- Fixed-width numeric decode benchmarks ---- + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeInt(NumericState state, Blackhole bh) throws IOException { + if (!state.intDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(state.intDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(state.intDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readInteger()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeLong(NumericState state, Blackhole bh) throws IOException { + if (!state.longDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(state.longDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(state.longDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readLong()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFloat(NumericState state, Blackhole bh) throws IOException { + if (!state.floatDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(state.floatDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(state.floatDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readFloat()); + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeDouble(NumericState state, Blackhole bh) throws IOException { + if (!state.doubleDictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(state.doubleDictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(state.doubleDictDataEncoded))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readDouble()); + } + } + + // ==== BINARY (parameterised by stringLength and cardinality) ==== + + @State(Scope.Thread) + public static class BinaryState { + @Param({"10", "100", "1000"}) + public int stringLength; + + @Param({"LOW", "HIGH"}) + public String cardinality; + + byte[] dictData; + Dictionary dictionary; + boolean dictAvailable; + + @Setup(Level.Trial) + public void setup() throws IOException { + int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0; + Binary[] data = TestDataFactory.generateBinaryData( + VALUE_COUNT, stringLength, distinct, TestDataFactory.DEFAULT_SEED); + + DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, + Encoding.PLAIN_DICTIONARY, + Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : data) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + dictData = enc.dictData; + dictAvailable = !enc.fellBackToPlain(); + if (dictAvailable) { + dictionary = new PlainValuesDictionary.PlainBinaryDictionary(enc.dictPage); + } + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeBinary(BinaryState state, Blackhole bh) throws IOException { + if (!state.dictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(state.dictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(state.dictData))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBytes()); + } + } + + // ==== FIXED_LEN_BYTE_ARRAY (parameterised by fixedLength and cardinality) ==== + + @State(Scope.Thread) + public static class FlbaState { + @Param({"2", "12", "16"}) + public int fixedLength; + + @Param({"LOW", "HIGH"}) + public String cardinality; + + byte[] dictData; + Dictionary dictionary; + boolean dictAvailable; + + @Setup(Level.Trial) + public void setup() throws IOException { + int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0; + Binary[] data = TestDataFactory.generateFixedLenByteArrays( + VALUE_COUNT, fixedLength, distinct, TestDataFactory.DEFAULT_SEED); + + DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w = + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter( + MAX_DICT_BYTE_SIZE, + fixedLength, + Encoding.PLAIN_DICTIONARY, + Encoding.PLAIN, + new HeapByteBufferAllocator()); + for (Binary v : data) { + w.writeBytes(v); + } + BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w); + dictData = enc.dictData; + dictAvailable = !enc.fellBackToPlain(); + if (dictAvailable) { + dictionary = new PlainValuesDictionary.PlainBinaryDictionary(enc.dictPage, fixedLength); + } + } + } + + @Benchmark + @OperationsPerInvocation(VALUE_COUNT) + public void decodeFlba(FlbaState state, Blackhole bh) throws IOException { + if (!state.dictAvailable) return; + DictionaryValuesReader r = new DictionaryValuesReader(state.dictionary); + r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(state.dictData))); + for (int i = 0; i < VALUE_COUNT; i++) { + bh.consume(r.readBytes()); + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java new file mode 100644 index 0000000000..1549205d3a --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DictionaryEncodingBenchmark.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Encoding-level micro-benchmarks for the DICTIONARY encoding across all Parquet + * types that support it: {@code INT32}, {@code INT64}, {@code FLOAT}, + * {@code DOUBLE}, {@code BINARY}, and {@code FIXED_LEN_BYTE_ARRAY}. + * Decoding benchmarks live in {@link DictionaryDecodingBenchmark}. + * + *
Each type group uses its own inner {@link State} class with independent + * {@code @Param} dimensions to avoid JMH cross-product pollution: + *
Each type's encode benchmark measures the full dictionary-build path
+ * (type-specific hash map + id append). Each invocation encodes
+ * {@value #VALUE_COUNT} values; throughput is reported per-value via
+ * {@link OperationsPerInvocation}.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Fork(1)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+public class DictionaryEncodingBenchmark {
+
+ static final int VALUE_COUNT = 100_000;
+ private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024;
+
+ // ==== Fixed-width numeric types (parameterised by dataPattern) ====
+
+ @State(Scope.Thread)
+ public static class NumericState {
+ @Param({"LOW_CARDINALITY", "HIGH_CARDINALITY"})
+ public String dataPattern;
+
+ int[] intData;
+ long[] longData;
+ float[] floatData;
+ double[] doubleData;
+
+ @Setup(Level.Trial)
+ public void setup() {
+ int distinct = "LOW_CARDINALITY".equals(dataPattern)
+ ? TestDataFactory.LOW_CARDINALITY_DISTINCT
+ : 0; // 0 = all unique for HIGH_CARDINALITY
+
+ long seed = TestDataFactory.DEFAULT_SEED;
+
+ if (distinct > 0) {
+ intData = TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, distinct, seed);
+ longData = TestDataFactory.generateLowCardinalityLongs(VALUE_COUNT, distinct, seed);
+ floatData = TestDataFactory.generateLowCardinalityFloats(VALUE_COUNT, distinct, seed);
+ doubleData = TestDataFactory.generateLowCardinalityDoubles(VALUE_COUNT, distinct, seed);
+ } else {
+ intData = TestDataFactory.generateRandomInts(VALUE_COUNT, seed);
+ longData = TestDataFactory.generateRandomLongs(VALUE_COUNT, seed);
+ floatData = TestDataFactory.generateRandomFloats(VALUE_COUNT, seed);
+ doubleData = TestDataFactory.generateRandomDoubles(VALUE_COUNT, seed);
+ }
+ }
+ }
+
+ @Benchmark
+ @OperationsPerInvocation(VALUE_COUNT)
+ public void encodeInt(NumericState state, Blackhole bh) throws IOException {
+ DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter w =
+ new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(
+ MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator());
+ for (int v : state.intData) {
+ w.writeInteger(v);
+ }
+ BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w);
+ bh.consume(enc.dictData);
+ bh.consume(enc.dictPage);
+ }
+
+ @Benchmark
+ @OperationsPerInvocation(VALUE_COUNT)
+ public void encodeLong(NumericState state, Blackhole bh) throws IOException {
+ DictionaryValuesWriter.PlainLongDictionaryValuesWriter w =
+ new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(
+ MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator());
+ for (long v : state.longData) {
+ w.writeLong(v);
+ }
+ BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w);
+ bh.consume(enc.dictData);
+ bh.consume(enc.dictPage);
+ }
+
+ @Benchmark
+ @OperationsPerInvocation(VALUE_COUNT)
+ public void encodeFloat(NumericState state, Blackhole bh) throws IOException {
+ DictionaryValuesWriter.PlainFloatDictionaryValuesWriter w =
+ new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter(
+ MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator());
+ for (float v : state.floatData) {
+ w.writeFloat(v);
+ }
+ BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w);
+ bh.consume(enc.dictData);
+ bh.consume(enc.dictPage);
+ }
+
+ @Benchmark
+ @OperationsPerInvocation(VALUE_COUNT)
+ public void encodeDouble(NumericState state, Blackhole bh) throws IOException {
+ DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter w =
+ new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter(
+ MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator());
+ for (double v : state.doubleData) {
+ w.writeDouble(v);
+ }
+ BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w);
+ bh.consume(enc.dictData);
+ bh.consume(enc.dictPage);
+ }
+
+ // ==== BINARY (parameterised by stringLength and cardinality) ====
+
+ @State(Scope.Thread)
+ public static class BinaryState {
+ @Param({"10", "100", "1000"})
+ public int stringLength;
+
+ @Param({"LOW", "HIGH"})
+ public String cardinality;
+
+ Binary[] data;
+
+ @Setup(Level.Trial)
+ public void setup() {
+ int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0;
+ data = TestDataFactory.generateBinaryData(
+ VALUE_COUNT, stringLength, distinct, TestDataFactory.DEFAULT_SEED);
+ }
+ }
+
+ @Benchmark
+ @OperationsPerInvocation(VALUE_COUNT)
+ public void encodeBinary(BinaryState state, Blackhole bh) throws IOException {
+ DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter w =
+ new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(
+ MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator());
+ for (Binary v : state.data) {
+ w.writeBytes(v);
+ }
+ BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w);
+ bh.consume(enc.dictData);
+ bh.consume(enc.dictPage);
+ }
+
+ // ==== FIXED_LEN_BYTE_ARRAY (parameterised by fixedLength and cardinality) ====
+
+ @State(Scope.Thread)
+ public static class FlbaState {
+ @Param({"2", "12", "16"})
+ public int fixedLength;
+
+ @Param({"LOW", "HIGH"})
+ public String cardinality;
+
+ Binary[] data;
+
+ @Setup(Level.Trial)
+ public void setup() {
+ int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0;
+ data = TestDataFactory.generateFixedLenByteArrays(
+ VALUE_COUNT, fixedLength, distinct, TestDataFactory.DEFAULT_SEED);
+ }
+ }
+
+ @Benchmark
+ @OperationsPerInvocation(VALUE_COUNT)
+ public void encodeFlba(FlbaState state, Blackhole bh) throws IOException {
+ DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter w =
+ new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(
+ MAX_DICT_BYTE_SIZE,
+ state.fixedLength,
+ Encoding.PLAIN_DICTIONARY,
+ Encoding.PLAIN,
+ new HeapByteBufferAllocator());
+ for (Binary v : state.data) {
+ w.writeBytes(v);
+ }
+ BenchmarkEncodingUtils.EncodedDictionary enc = BenchmarkEncodingUtils.drainDictionary(w);
+ bh.consume(enc.dictData);
+ bh.consume(enc.dictPage);
+ }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java
new file mode 100644
index 0000000000..6bd19a0148
--- /dev/null
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.benchmarks;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Random;
+import org.apache.parquet.io.api.Binary;
+
+/**
+ * Utility class for generating test data for dictionary encoding benchmarks.
+ */
+public final class TestDataFactory {
+
+ /** Number of distinct values for low-cardinality data patterns. */
+ public static final int LOW_CARDINALITY_DISTINCT = 100;
+
+ /** Default RNG seed used across benchmarks for deterministic data. */
+ public static final long DEFAULT_SEED = 42L;
+
+ private TestDataFactory() {}
+
+ // ---- Integer data generation for encoding benchmarks ----
+
+ /**
+ * Generates uniformly random integers using the given seed.
+ */
+ public static int[] generateRandomInts(int count, long seed) {
+ Random random = new Random(seed);
+ int[] data = new int[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = random.nextInt();
+ }
+ return data;
+ }
+
+ /**
+ * Generates low-cardinality integers (values drawn from a small set) using the given seed.
+ */
+ public static int[] generateLowCardinalityInts(int count, int distinctValues, long seed) {
+ Random random = new Random(seed);
+ int[] data = new int[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = random.nextInt(distinctValues);
+ }
+ return data;
+ }
+
+ // ---- Long data generation for encoding benchmarks ----
+
+ /**
+ * Generates uniformly random longs using the given seed.
+ */
+ public static long[] generateRandomLongs(int count, long seed) {
+ Random random = new Random(seed);
+ long[] data = new long[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = random.nextLong();
+ }
+ return data;
+ }
+
+ /**
+ * Generates low-cardinality longs (values drawn from a small set).
+ */
+ public static long[] generateLowCardinalityLongs(int count, int distinctValues, long seed) {
+ Random random = new Random(seed);
+ long[] palette = new long[distinctValues];
+ for (int i = 0; i < distinctValues; i++) {
+ palette[i] = random.nextLong();
+ }
+ long[] data = new long[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = palette[random.nextInt(distinctValues)];
+ }
+ return data;
+ }
+
+ // ---- Float data generation for encoding benchmarks ----
+
+ /**
+ * Generates uniformly random floats using the given seed.
+ */
+ public static float[] generateRandomFloats(int count, long seed) {
+ Random random = new Random(seed);
+ float[] data = new float[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = random.nextFloat() * 1000.0f;
+ }
+ return data;
+ }
+
+ /**
+ * Generates low-cardinality floats (values drawn from a small set).
+ */
+ public static float[] generateLowCardinalityFloats(int count, int distinctValues, long seed) {
+ Random random = new Random(seed);
+ float[] palette = new float[distinctValues];
+ for (int i = 0; i < distinctValues; i++) {
+ palette[i] = random.nextFloat() * 1000.0f;
+ }
+ float[] data = new float[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = palette[random.nextInt(distinctValues)];
+ }
+ return data;
+ }
+
+ // ---- Double data generation for encoding benchmarks ----
+
+ /**
+ * Generates uniformly random doubles using the given seed.
+ */
+ public static double[] generateRandomDoubles(int count, long seed) {
+ Random random = new Random(seed);
+ double[] data = new double[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = random.nextDouble() * 1000.0;
+ }
+ return data;
+ }
+
+ /**
+ * Generates low-cardinality doubles (values drawn from a small set).
+ */
+ public static double[] generateLowCardinalityDoubles(int count, int distinctValues, long seed) {
+ Random random = new Random(seed);
+ double[] palette = new double[distinctValues];
+ for (int i = 0; i < distinctValues; i++) {
+ palette[i] = random.nextDouble() * 1000.0;
+ }
+ double[] data = new double[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = palette[random.nextInt(distinctValues)];
+ }
+ return data;
+ }
+
+ // ---- Fixed-length byte array data generation for encoding benchmarks ----
+
+ /**
+ * Generates fixed-length byte arrays with the specified cardinality.
+ *
+ * @param count number of values
+ * @param length byte length of each value
+ * @param distinct number of distinct values (0 means all unique)
+ * @param seed RNG seed
+ */
+ public static Binary[] generateFixedLenByteArrays(int count, int length, int distinct, long seed) {
+ Random random = new Random(seed);
+ if (distinct > 0) {
+ Binary[] palette = new Binary[distinct];
+ for (int i = 0; i < distinct; i++) {
+ byte[] bytes = new byte[length];
+ random.nextBytes(bytes);
+ palette[i] = Binary.fromConstantByteArray(bytes);
+ }
+ Binary[] data = new Binary[count];
+ for (int i = 0; i < count; i++) {
+ data[i] = palette[random.nextInt(distinct)];
+ }
+ return data;
+ } else {
+ Binary[] data = new Binary[count];
+ for (int i = 0; i < count; i++) {
+ byte[] bytes = new byte[length];
+ random.nextBytes(bytes);
+ data[i] = Binary.fromConstantByteArray(bytes);
+ }
+ return data;
+ }
+ }
+
+ // ---- Binary data generation for encoding benchmarks ----
+
+ /**
+ * Generates binary strings of the given length with the specified cardinality.
+ *
+ * @param count number of values
+ * @param stringLength length of each string
+ * @param distinct number of distinct values (0 means all unique)
+ * @param seed RNG seed
+ * @return array of Binary values
+ */
+ public static Binary[] generateBinaryData(int count, int stringLength, int distinct, long seed) {
+ Random random = new Random(seed);
+ Binary[] data = new Binary[count];
+ if (distinct > 0) {
+ // Pre-generate the distinct values
+ Binary[] dictionary = new Binary[distinct];
+ for (int i = 0; i < distinct; i++) {
+ dictionary[i] = Binary.fromConstantByteArray(
+ randomString(stringLength, random).getBytes(StandardCharsets.UTF_8));
+ }
+ for (int i = 0; i < count; i++) {
+ data[i] = dictionary[random.nextInt(distinct)];
+ }
+ } else {
+ // All unique
+ for (int i = 0; i < count; i++) {
+ data[i] = Binary.fromConstantByteArray(
+ randomString(stringLength, random).getBytes(StandardCharsets.UTF_8));
+ }
+ }
+ return data;
+ }
+
+ private static String randomString(int length, Random random) {
+ StringBuilder sb = new StringBuilder(length);
+ for (int i = 0; i < length; i++) {
+ sb.append((char) ('a' + random.nextInt(26)));
+ }
+ return sb.toString();
+ }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
index 53526ae8d0..dbca68e886 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java
@@ -20,23 +20,22 @@
import static org.apache.parquet.bytes.BytesInput.concat;
-import it.unimi.dsi.fastutil.doubles.Double2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.doubles.Double2IntMap;
-import it.unimi.dsi.fastutil.doubles.DoubleIterator;
-import it.unimi.dsi.fastutil.floats.Float2IntLinkedOpenHashMap;
+import it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap;
+import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.floats.Float2IntMap;
-import it.unimi.dsi.fastutil.floats.FloatIterator;
-import it.unimi.dsi.fastutil.ints.Int2IntLinkedOpenHashMap;
+import it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap;
+import it.unimi.dsi.fastutil.floats.FloatArrayList;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
-import it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
+import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.longs.Long2IntMap;
-import it.unimi.dsi.fastutil.longs.LongIterator;
-import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
-import it.unimi.dsi.fastutil.objects.ObjectIterator;
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.bytes.BytesInput;
@@ -231,7 +230,8 @@ public String memUsageString(String prefix) {
public static class PlainBinaryDictionaryValuesWriter extends DictionaryValuesWriter {
/* type specific dictionary content */
- protected Object2IntMap The sentinel value {@code 0} means "not yet computed". This follows the
+ * {@link String#hashCode()} idiom: races between concurrent first calls are benign
+ * because the computation is deterministic, and a hash that genuinely equals {@code 0}
+ * will simply be recomputed on every call (acceptably rare). Reused instances never
+ * cache (their backing bytes can mutate after construction).
+ * Package-private (rather than private) so subclasses can read it directly without
+ * an extra method call on the {@link #hashCode()} hot path.
+ */
+ transient int cachedHashCode;
+
// this isn't really something others should extend
private Binary() {}
@@ -101,6 +113,18 @@ public boolean equals(Object obj) {
return false;
}
+ /**
+ * Caches {@code hashCode} for non-reused instances and returns it. The cache uses
+ * a single int field with sentinel {@code 0} to remain race-safe without volatile.
+ * If the computed hash is {@code 0}, no caching occurs and the next call recomputes.
+ */
+ final int cacheHashCode(int hashCode) {
+ if (!isBackingBytesReused) {
+ cachedHashCode = hashCode;
+ }
+ return hashCode;
+ }
+
@Override
public String toString() {
return "Binary{" + length()
@@ -180,7 +204,11 @@ public Binary slice(int start, int length) {
@Override
public int hashCode() {
- return Binary.hashCode(value, offset, length);
+ int h = cachedHashCode;
+ if (h != 0) {
+ return h;
+ }
+ return cacheHashCode(Binary.hashCode(value, offset, length));
}
@Override
@@ -340,7 +368,11 @@ public Binary slice(int start, int length) {
@Override
public int hashCode() {
- return Binary.hashCode(value, 0, value.length);
+ int h = cachedHashCode;
+ if (h != 0) {
+ return h;
+ }
+ return cacheHashCode(Binary.hashCode(value, 0, value.length));
}
@Override
@@ -499,11 +531,18 @@ public Binary slice(int start, int length) {
@Override
public int hashCode() {
+ int h = cachedHashCode;
+ if (h != 0) {
+ return h;
+ }
+
+ int computedHashCode;
if (value.hasArray()) {
- return Binary.hashCode(value.array(), value.arrayOffset() + offset, length);
+ computedHashCode = Binary.hashCode(value.array(), value.arrayOffset() + offset, length);
} else {
- return Binary.hashCode(value, offset, length);
+ computedHashCode = Binary.hashCode(value, offset, length);
}
+ return cacheHashCode(computedHashCode);
}
@Override
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java
index 6b5182ab73..74d6375723 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java
@@ -53,8 +53,15 @@ public void testListGreaterThanMaxSlabSize() {
private void doTestIntList(int testSize, int expectedSlabSize) {
IntList testList = new IntList();
+
+ // size() must be 0 before any adds
+ Assert.assertEquals(0, testList.size());
+
populateList(testList, testSize);
+ // size() must match the number of elements added
+ Assert.assertEquals(testSize, testList.size());
+
verifyIteratorResults(testSize, testList);
// confirm the size of the current slab
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
index a91f807e73..2b1cd574f5 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
@@ -43,6 +43,7 @@
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter;
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter;
+import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter;
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter;
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter;
@@ -659,6 +660,73 @@ public void testFloatDictionaryFallBack() throws IOException {
}
}
+ private FallbackValuesWriter