Skip to content

Commit 6d65eaa

Browse files
committed
Fix uint16 overflow bug in max vector
size and add independent reader/writer verification tests
1 parent 0c393e0 commit 6d65eaa

7 files changed

Lines changed: 767 additions & 335 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java

Lines changed: 13 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -37,102 +37,46 @@ private AlpConstants() {
3737
// Utility class
3838
}
3939

40-
// ========== Page Header Constants ==========
41-
42-
/** Current ALP format version */
40+
// Page header fields
4341
public static final int ALP_VERSION = 1;
44-
45-
/** ALP compression mode identifier (0 = ALP) */
4642
public static final int ALP_COMPRESSION_MODE = 0;
47-
48-
/** Frame of Reference encoding for integers (0 = Frame of Reference) */
4943
public static final int ALP_INTEGER_ENCODING_FOR = 0;
50-
51-
/** Size of the ALP page header in bytes */
5244
public static final int ALP_HEADER_SIZE = 8;
5345

54-
// ========== Vector Size Constants ==========
55-
56-
/** Default number of elements per compressed vector (2^10 = 1024) */
5746
public static final int DEFAULT_VECTOR_SIZE = 1024;
58-
59-
/** Log2 of the default vector size */
6047
public static final int DEFAULT_VECTOR_SIZE_LOG = 10;
6148

62-
/** Maximum allowed log2 of vector size */
63-
static final int MAX_LOG_VECTOR_SIZE = 16;
64-
65-
/** Minimum allowed log2 of vector size */
49+
// Capped at 15 (vectorSize=32768) because num_exceptions is uint16,
50+
// so vectorSize must not exceed 65535: the next power of two, 2^16 = 65536, would overflow the count when all values are exceptions.
51+
static final int MAX_LOG_VECTOR_SIZE = 15;
6652
static final int MIN_LOG_VECTOR_SIZE = 3;
6753

68-
// ========== Exponent/Factor Limits ==========
69-
70-
/** Maximum exponent for float encoding (10^10 ~ 10 billion) */
7154
static final int FLOAT_MAX_EXPONENT = 10;
72-
73-
/** Maximum exponent for double encoding (10^18 ~ 1 quintillion) */
7455
static final int DOUBLE_MAX_EXPONENT = 18;
7556

76-
// ========== Sampling Constants ==========
77-
78-
/** Number of sample vectors used for preset caching */
57+
// Preset caching: full search for the first SAMPLER_SAMPLE_VECTORS vectors, then lock in the top MAX_PRESET_COMBINATIONS (exponent, factor) combos
7958
static final int SAMPLER_SAMPLE_VECTORS = 8;
80-
81-
/** Maximum (exponent, factor) combinations to keep in preset */
8259
static final int MAX_PRESET_COMBINATIONS = 5;
8360

84-
// ========== Fast Rounding Magic Numbers ==========
85-
86-
/**
87-
* Magic number for fast float rounding using the floating-point trick.
88-
* Formula: 2^22 + 2^23 = 12,582,912
89-
*/
90-
static final float MAGIC_FLOAT = 12_582_912.0f;
91-
92-
/**
93-
* Magic number for fast double rounding using the floating-point trick.
94-
* Formula: 2^51 + 2^52 = 6,755,399,441,055,744
95-
*/
96-
static final double MAGIC_DOUBLE = 6_755_399_441_055_744.0;
97-
98-
// ========== Metadata Sizes ==========
99-
100-
/** Size of AlpInfo structure in bytes (exponent:1 + factor:1 + num_exceptions:2) */
101-
public static final int ALP_INFO_SIZE = 4;
61+
// Magic numbers for the fast-rounding trick (see ALP paper, Section 3.2)
62+
static final float MAGIC_FLOAT = 12_582_912.0f; // 2^22 + 2^23
63+
static final double MAGIC_DOUBLE = 6_755_399_441_055_744.0; // 2^51 + 2^52
10264

103-
/** Size of ForInfo structure for float (frame_of_reference:4 + bit_width:1) */
104-
public static final int FLOAT_FOR_INFO_SIZE = 5;
65+
// Per-vector metadata sizes in bytes
66+
public static final int ALP_INFO_SIZE = 4; // exponent(1) + factor(1) + num_exceptions(2)
67+
public static final int FLOAT_FOR_INFO_SIZE = 5; // frame_of_reference(4) + bit_width(1)
68+
public static final int DOUBLE_FOR_INFO_SIZE = 9; // frame_of_reference(8) + bit_width(1)
10569

106-
/** Size of ForInfo structure for double (frame_of_reference:8 + bit_width:1) */
107-
public static final int DOUBLE_FOR_INFO_SIZE = 9;
108-
109-
// ========== Precomputed Powers of 10 ==========
110-
111-
/** Precomputed powers of 10 for float encoding (10^0 to 10^10) */
11270
static final float[] FLOAT_POW10 = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f};
11371

114-
/** Precomputed powers of 10 for double encoding (10^0 to 10^18) */
11572
static final double[] DOUBLE_POW10 = {
11673
1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18
11774
};
11875

119-
// ========== Bit Masks for Negative Zero Detection ==========
120-
121-
/** Bit pattern for negative zero in float */
12276
static final int FLOAT_NEGATIVE_ZERO_BITS = 0x80000000;
123-
124-
/** Bit pattern for negative zero in double */
12577
static final long DOUBLE_NEGATIVE_ZERO_BITS = 0x8000000000000000L;
12678

127-
// ========== Validation ==========
128-
129-
/**
130-
* Validate that a vector size is a power of 2 and within the allowed range.
131-
*
132-
* @param vectorSize the vector size to validate
133-
* @return the validated vector size
134-
* @throws IllegalArgumentException if the vector size is invalid
135-
*/
79+
/** Validates vector size: must be a power of 2 in [2^MIN_LOG_VECTOR_SIZE .. 2^MAX_LOG_VECTOR_SIZE]. */
13680
static int validateVectorSize(int vectorSize) {
13781
Preconditions.checkArgument(
13882
vectorSize > 0 && (vectorSize & (vectorSize - 1)) == 0,

0 commit comments

Comments
 (0)