@@ -467,6 +467,35 @@ struct SchemaElement {
467467 10: optional LogicalType logicalType
468468}
469469
/**
 * V3 variant of SchemaElement: describes one field of the file schema.
 *
 * NOTE(review): field ids are renumbered relative to SchemaElement
 * (logicalType is id 10 there and id 7 here), so the two structs do not
 * share a Thrift field layout.
 */
470+ struct SchemaElementV3 {
471+ /** Physical data type of this field. */
472+ 1: optional Type type ;
473+
474+ /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
475+ *
476+ * CHANGED from v1: this must be omitted for all other column types.
477+ */
478+ 2: optional i32 type_length ;
479+
480+ /** Repetition of the field. */
481+ 3: optional FieldRepetitionType repetition_type ;
482+
483+ /** Name of the field in the schema. This is the only required member. */
484+ 4: required string name ;
485+
486+ /** Number of nested (child) fields of this element. */
487+ 5: optional i32 num_children ;
488+
489+ /** Identifier of this field. CHANGED from v1: type widened from i32 to i64.
490+ */
491+ 6: optional i64 field_id ;
492+
493+ /** The logical type of this SchemaElement */
494+ 7: optional LogicalType logicalType
495+
496+ /** REMOVED from v1: converted_type, scale, precision (obsolete) */
497+ }
498+
470499/**
471500 * Encodings supported by Parquet. Not all encodings are valid for all types. These
472501 * enums are also used to specify the encoding of definition and repetition levels.
@@ -835,6 +864,63 @@ struct ColumnMetaData {
835864 16: optional SizeStatistics size_statistics ;
836865}
837866
/**
 * V3 variant of ColumnMetaData: metadata for a single column chunk
 * (compression codec, value count, byte sizes, page offsets, Bloom filter
 * location and size statistics).
 *
 * NOTE(review): field ids are renumbered relative to ColumnMetaData
 * (size_statistics is id 16 there and id 11 here).
 */
867+ struct ColumnChunkMetaDataV3 {
868+ /** REMOVED from v1: type (redundant with SchemaElementV3) */
869+ /** REMOVED from v1: encodings (unnecessary and non-trivial to get right) */
870+ /** REMOVED from v1: path_in_schema (unnecessary and wasteful) */
871+ /** REMOVED from v1: index_page_offset (unused in practice?) */
872+ /** REMOVED from v1: statistics (use ColumnIndex and/or page-level statistics instead) */
873+
874+ /** Compression codec **/
875+ 1: required CompressionCodec codec
876+
877+ /** Number of values in this column chunk **/
878+ 2: required i64 num_values
879+
880+ /** Total byte size of all uncompressed pages in this column chunk (including the headers) **/
881+ 3: required i64 total_uncompressed_size
882+
883+ /** Total byte size of all compressed, and potentially encrypted, pages
884+ * in this column chunk (including the headers) **/
885+ 4: required i64 total_compressed_size
886+
887+ /** Optional key/value metadata for this column chunk.
888+ ** CHANGED from v1: only use this for chunk-specific metadata, otherwise
889+ ** use `FileColumnMetadataV3.key_value_metadata`.
890+ **/
891+ 5: optional list<KeyValue> key_value_metadata
892+
893+ /** Byte offset from beginning of file to first data page **/
894+ 6: required i64 data_page_offset
895+
896+ /** Byte offset from the beginning of file to first (only) dictionary page **/
897+ 7: optional i64 dictionary_page_offset
898+
899+ /** Set of all encodings used for pages in this column chunk.
900+ * This information can be used to determine if all data pages are
901+ * dictionary encoded, for example. **/
902+ 8: optional list<PageEncodingStats> encoding_stats ;
903+
904+ /** Byte offset from beginning of file to Bloom filter data. **/
905+ 9: optional i64 bloom_filter_offset ;
906+
907+ /** Size of Bloom filter data including the serialized header, in bytes.
908+ * Added in 2.10 so readers may not read this field from old files and
909+ * it can be obtained after the BloomFilterHeader has been deserialized.
910+ * Writers should write this field so readers can read the bloom filter
911+ * in a single I/O.
 * NOTE(review): the "Added in 2.10" history note looks carried over from
 * the v1 struct; confirm whether it still applies to this new V3 struct.
912+ */
913+ 10: optional i32 bloom_filter_length ;
914+
915+ /**
916+ * Optional statistics to help estimate total memory when converted to in-memory
917+ * representations. The histograms contained in these statistics can
918+ * also be useful in some cases for more fine-grained nullability/list length
919+ * filter pushdown.
920+ */
921+ 11: optional SizeStatistics size_statistics ;
922+ }
923+
838924struct EncryptionWithFooterKey {
839925}
840926
@@ -885,6 +971,44 @@ struct ColumnChunk {
885971 9: optional binary encrypted_column_metadata
886972}
887973
/**
 * V3 variant of ColumnChunk: locates the ColumnChunkMetaDataV3 of one column
 * chunk (by byte offset and length, optionally also inlined as
 * encoded_metadata) together with the chunk's OffsetIndex, ColumnIndex and
 * crypto metadata.
 */
974+ struct ColumnChunkV3 {
975+ /** File where column data is stored. **/
976+ 1: optional string file_path
977+
978+ /** Byte offset in file_path to the ColumnChunkMetaDataV3, optionally encrypted
979+ ** CHANGED from v1: renamed to metadata_file_offset
980+ **/
981+ 2: required i64 metadata_file_offset
982+
983+ /** NEW from v1: Byte length in file_path of ColumnChunkMetaDataV3, optionally encrypted
984+ **/
985+ 3: required i64 metadata_file_length
986+
987+ /** REMOVED from v1: meta_data, encrypted_column_metadata.
988+ ** Use encoded_metadata instead.
989+ **/
990+
991+ /** NEW from v1: Column metadata for this chunk, duplicated here from file_path.
992+ ** This is a Thrift-encoded ColumnChunkMetaDataV3, optionally encrypted.
993+ **/
994+ 4: optional binary encoded_metadata
995+
996+ /** File offset of ColumnChunk's OffsetIndex. CHANGED from v1: this is now required **/
997+ 5: required i64 offset_index_offset
998+
999+ /** Size of ColumnChunk's OffsetIndex, in bytes. CHANGED from v1: this is now required **/
1000+ 6: required i32 offset_index_length
1001+
1002+ /** File offset of ColumnChunk's ColumnIndex **/
1003+ 7: optional i64 column_index_offset
1004+
1005+ /** Size of ColumnChunk's ColumnIndex, in bytes **/
1006+ 8: optional i32 column_index_length
1007+
1008+ /** Crypto metadata of encrypted columns **/
1009+ 9: optional ColumnCryptoMetaData crypto_metadata
1010+ }
1011+
8881012struct RowGroup {
8891013 /** Metadata for each column chunk in this row group.
8901014 * This list must have the same order as the SchemaElement list in FileMetaData.
@@ -914,6 +1038,32 @@ struct RowGroup {
9141038 7: optional i16 ordinal
9151039}
9161040
/**
 * V3 variant of RowGroup: row-group-level byte totals, row count, sort
 * order and ordinal. Per-column chunk information is not stored here (see
 * the REMOVED notes below).
 *
 * NOTE(review): field ids are renumbered relative to RowGroup
 * (ordinal is id 7 there and id 5 here).
 */
1041+ struct RowGroupV3 {
1042+ /** REMOVED from v1: columns.
1043+ * Instead, decode each FileColumnMetadataV3 individually as needed.
1044+ */
1045+
1046+ /** Total byte size of all the uncompressed column data in this row group **/
1047+ 1: required i64 total_byte_size
1048+
1049+ /** Number of rows in this row group **/
1050+ 2: required i64 num_rows
1051+
1052+ /** If set, specifies a sort ordering of the rows in this row group. */
1053+ 3: optional list<SortingColumn> sorting_columns
1054+
1055+ /** REMOVED from v1: file_offset.
1056+ * Use the OffsetIndex for each column instead.
1057+ */
1058+
1059+ /** Total byte size of all compressed (and potentially encrypted) column data
1060+ * in this row group **/
1061+ 4: optional i64 total_compressed_size
1062+
1063+ /** Row group ordinal in the file **/
1064+ 5: optional i16 ordinal
1065+ }
1066+
9171067/** Empty marker struct: signals that the order is the one defined by the physical or logical type. */
9181068struct TypeDefinedOrder {}
9191069
@@ -1165,6 +1315,62 @@ struct FileMetaData {
11651315 9: optional binary footer_signing_key_metadata
11661316}
11671317
1318+ /** Metadata for a column in this file: its column chunks (one per row group), its column order and file-level key/value metadata. */
1319+ struct FileColumnMetadataV3 {
1320+ /** All column chunks in this file (one per row group) **/
1321+ 1: required list<ColumnChunkV3> columns
1322+
1323+ /** Sort order used for the Statistics min_value and max_value fields
 ** NOTE(review): ColumnChunkMetaDataV3 removes chunk-level statistics, so
 ** this presumably governs ColumnIndex / page-level statistics - confirm.
1324+ **/
1325+ 2: optional ColumnOrder column_order ;
1326+
1327+ /** NEW from v1: Optional key/value metadata for this column at the file level
1328+ **/
1329+ 3: optional list<KeyValue> key_value_metadata
1330+ }
1331+
/**
 * V3 variant of FileMetaData: file-level metadata. Per-column metadata is
 * not embedded here; each FileColumnMetadataV3 is located through
 * file_column_metadata_offset / file_column_metadata_length.
 *
 * NOTE(review): field ids are renumbered relative to FileMetaData
 * (footer_signing_key_metadata is id 9 there and id 10 here).
 */
1332+ struct FileMetaDataV3 {
1333+ /** Version of this file **/
1334+ 1: required i32 version
1335+
1336+ /** Parquet schema for this file **/
1337+ 2: required list<SchemaElementV3> schema ;
1338+
1339+ /** Number of rows in this file **/
1340+ 3: required i64 num_rows
1341+
1342+ /** Row groups in this file **/
1343+ 4: required list<RowGroupV3> row_groups
1344+
1345+ /** Optional key/value metadata for this file. **/
1346+ 5: optional list<KeyValue> key_value_metadata
1347+
1348+ /** String for application that wrote this file. **/
1349+ 6: optional string created_by
1350+
1351+ /** NEW from v1: byte offset of FileColumnMetadataV3, for each column. NOTE(review): presumably parallel to file_column_metadata_length, one entry per column - confirm ordering. **/
1352+ 7: required list<i64> file_column_metadata_offset ;
1353+ /** NEW from v1: byte length of FileColumnMetadataV3, for each column. NOTE(review): presumably parallel to file_column_metadata_offset - confirm. **/
1354+ 8: required list<i64> file_column_metadata_length ;
1355+
1356+ /** REMOVED from v1: column_orders.
1357+ ** Use `FileColumnMetadataV3.column_order` instead.
1358+ **/
1359+
1360+ /**
1361+ * Encryption algorithm. This field is set only in encrypted files
1362+ * with plaintext footer. Files with encrypted footer store algorithm id
1363+ * in FileCryptoMetaData structure.
1364+ */
1365+ 9: optional EncryptionAlgorithm encryption_algorithm
1366+
1367+ /**
1368+ * Retrieval metadata of key used for signing the footer.
1369+ * Used only in encrypted files with plaintext footer.
1370+ */
1371+ 10: optional binary footer_signing_key_metadata
1372+ }
1373+
11681374/** Crypto metadata for files with encrypted footer **/
11691375struct FileCryptoMetaData {
11701376 /**
0 commit comments