Skip to content

Commit b5e495d

Browse files
authored
GH-48858: [C++][Parquet] Avoid re-serializing footer for signature verification (#48859)
### Rationale for this change When reading an encrypted Parquet file with a plaintext footer, the Parquet reader is able to verify footer integrity by comparing the signature in the file with the one computed by encrypting the footer. However, the way it does this is to first re-serializes the deserialized footer using Thrift. This has several issues: 1. it's inefficient 2. it's not obvious that it will always produce the same Thrift encoding as the original, leading to spurious signature verification failures 3. if the original footer deserializes to invalid enum values, attempting to serialize it again will lead to undefined behavior Reason 3 is what allowed this to be uncovered by OSS-Fuzz (see https://oss-fuzz.com/testcase-detail/4740205688193024). This PR switches to reusing the original serialized metadata. ### Are these changes tested? Yes, by existing tests and new fuzz regression file. ### Are there any user-facing changes? No. * GitHub Issue: #48858 Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent c5b3622 commit b5e495d

3 files changed

Lines changed: 99 additions & 71 deletions

File tree

cpp/src/parquet/file_reader.cc

Lines changed: 55 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ using arrow::internal::AddWithOverflow;
5555

5656
namespace parquet {
5757

58+
using ::arrow::Future;
59+
using ::arrow::Result;
60+
using ::arrow::Status;
61+
5862
namespace {
5963
bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) {
6064
// Check the encoding_stats to see if all data pages are dictionary encoded.
@@ -398,7 +402,7 @@ class SerializedFile : public ParquetFileReader::Contents {
398402
PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
399403
}
400404

401-
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
405+
Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
402406
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
403407
int64_t hole_size_limit, int64_t range_size_limit) {
404408
std::vector<::arrow::io::ReadRange> ranges;
@@ -413,10 +417,10 @@ class SerializedFile : public ParquetFileReader::Contents {
413417
range_size_limit);
414418
}
415419

416-
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
417-
const std::vector<int>& column_indices) const {
420+
Future<> WhenBuffered(const std::vector<int>& row_groups,
421+
const std::vector<int>& column_indices) const {
418422
if (!cached_source_) {
419-
return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
423+
return Status::Invalid("Must call PreBuffer before WhenBuffered");
420424
}
421425
std::vector<::arrow::io::ReadRange> ranges;
422426
for (int row : row_groups) {
@@ -465,23 +469,8 @@ class SerializedFile : public ParquetFileReader::Contents {
465469
// Fall through
466470
}
467471

468-
const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
469-
metadata_buffer, metadata_len, std::move(file_decryptor));
470-
auto file_decryption_properties = properties_.file_decryption_properties();
471-
if (is_encrypted_footer) {
472-
// Nothing else to do here.
473-
return;
474-
} else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
475-
if (file_decryption_properties != nullptr) {
476-
if (!file_decryption_properties->plaintext_files_allowed()) {
477-
throw ParquetException("Applying decryption properties on plaintext file");
478-
}
479-
}
480-
} else {
481-
// Encrypted file with plaintext footer mode.
482-
ParseMetaDataOfEncryptedFileWithPlaintextFooter(
483-
file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
484-
}
472+
ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
473+
std::move(file_decryptor));
485474
}
486475

487476
// Validate the source size and get the initial read size.
@@ -522,16 +511,15 @@ class SerializedFile : public ParquetFileReader::Contents {
522511
}
523512

524513
// Does not throw.
525-
::arrow::Future<> ParseMetaDataAsync() {
514+
Future<> ParseMetaDataAsync() {
526515
int64_t footer_read_size;
527516
BEGIN_PARQUET_CATCH_EXCEPTIONS
528517
footer_read_size = GetFooterReadSize();
529518
END_PARQUET_CATCH_EXCEPTIONS
530519
// Assumes this is kept alive externally
531520
return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
532-
.Then([this,
533-
footer_read_size](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
534-
-> ::arrow::Future<> {
521+
.Then([this, footer_read_size](
522+
const std::shared_ptr<::arrow::Buffer>& footer_buffer) -> Future<> {
535523
uint32_t metadata_len;
536524
BEGIN_PARQUET_CATCH_EXCEPTIONS
537525
metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
@@ -557,7 +545,7 @@ class SerializedFile : public ParquetFileReader::Contents {
557545
}
558546

559547
// Continuation
560-
::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
548+
Future<> ParseMaybeEncryptedMetaDataAsync(
561549
std::shared_ptr<::arrow::Buffer> footer_buffer,
562550
std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
563551
uint32_t metadata_len) {
@@ -580,26 +568,30 @@ class SerializedFile : public ParquetFileReader::Contents {
580568
file_decryptor = std::move(file_decryptor)](
581569
const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
582570
// Continue and read the file footer
583-
return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
584-
file_decryptor);
571+
BEGIN_PARQUET_CATCH_EXCEPTIONS
572+
ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
573+
file_decryptor);
574+
END_PARQUET_CATCH_EXCEPTIONS
575+
return Status::OK();
585576
});
586577
}
587-
return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
588-
is_encrypted_footer, std::move(file_decryptor));
578+
BEGIN_PARQUET_CATCH_EXCEPTIONS
579+
ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
580+
std::move(file_decryptor));
581+
END_PARQUET_CATCH_EXCEPTIONS
582+
return Status::OK();
589583
}
590584

591585
// Continuation
592-
::arrow::Status ParseMetaDataFinal(
593-
std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len,
594-
const bool is_encrypted_footer,
595-
std::shared_ptr<InternalFileDecryptor> file_decryptor) {
596-
BEGIN_PARQUET_CATCH_EXCEPTIONS
586+
void ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
587+
uint32_t metadata_len, const bool is_encrypted_footer,
588+
std::shared_ptr<InternalFileDecryptor> file_decryptor) {
597589
const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
598590
metadata_buffer, metadata_len, std::move(file_decryptor));
599591
auto file_decryption_properties = properties_.file_decryption_properties();
600592
if (is_encrypted_footer) {
601593
// Nothing else to do here.
602-
return ::arrow::Status::OK();
594+
return;
603595
} else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
604596
if (file_decryption_properties != nullptr) {
605597
if (!file_decryption_properties->plaintext_files_allowed()) {
@@ -611,8 +603,6 @@ class SerializedFile : public ParquetFileReader::Contents {
611603
ParseMetaDataOfEncryptedFileWithPlaintextFooter(
612604
file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
613605
}
614-
END_PARQUET_CATCH_EXCEPTIONS
615-
return ::arrow::Status::OK();
616606
}
617607

618608
private:
@@ -707,20 +697,16 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
707697
auto file_decryptor = std::make_shared<InternalFileDecryptor>(
708698
file_decryption_properties, file_aad, algo.algorithm,
709699
file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
710-
// set the InternalFileDecryptor in the metadata as well, as it's used
711-
// for signature verification and for ColumnChunkMetaData creation.
712-
file_metadata_->set_file_decryptor(std::move(file_decryptor));
700+
// Set the InternalFileDecryptor in the metadata as well, as it's used
701+
// for ColumnChunkMetaData creation.
702+
file_metadata_->set_file_decryptor(file_decryptor);
713703

714704
if (file_decryption_properties->check_plaintext_footer_integrity()) {
715-
if (metadata_len - read_metadata_len !=
716-
(parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
717-
throw ParquetInvalidOrCorruptedFileException(
718-
"Failed reading metadata for encryption signature (requested ",
719-
parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
720-
" bytes but have ", metadata_len - read_metadata_len, " bytes)");
721-
}
722-
723-
if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
705+
auto serialized_metadata =
706+
metadata_buffer->span_as<uint8_t>().subspan(0, read_metadata_len);
707+
auto signature = metadata_buffer->span_as<uint8_t>().subspan(read_metadata_len);
708+
if (!FileMetaData::VerifySignature(serialized_metadata, signature,
709+
file_decryptor.get())) {
724710
throw ParquetInvalidOrCorruptedFileException(
725711
"Parquet crypto signature verification failed");
726712
}
@@ -804,7 +790,7 @@ std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
804790
return result;
805791
}
806792

807-
::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
793+
Future<std::unique_ptr<ParquetFileReader::Contents>>
808794
ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
809795
const ReaderProperties& props,
810796
std::shared_ptr<FileMetaData> metadata) {
@@ -815,7 +801,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
815801
if (metadata == nullptr) {
816802
// TODO(ARROW-12259): workaround since we have Future<(move-only type)>
817803
struct {
818-
::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
804+
Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
819805
return std::move(result);
820806
}
821807

@@ -825,7 +811,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
825811
return file->ParseMetaDataAsync().Then(std::move(Continuation));
826812
} else {
827813
file->set_metadata(std::move(metadata));
828-
return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
814+
return Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
829815
std::move(result));
830816
}
831817
END_PARQUET_CATCH_EXCEPTIONS
@@ -855,24 +841,24 @@ std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
855841
return Open(std::move(source), props, std::move(metadata));
856842
}
857843

858-
::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
844+
Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
859845
std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
860846
std::shared_ptr<FileMetaData> metadata) {
861847
BEGIN_PARQUET_CATCH_EXCEPTIONS
862848
auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
863849
// TODO(ARROW-12259): workaround since we have Future<(move-only type)>
864-
auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
865-
fut.AddCallback([fut, completed](
866-
const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
867-
contents) mutable {
868-
if (!contents.ok()) {
869-
completed.MarkFinished(contents.status());
870-
return;
871-
}
872-
std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
873-
result->Open(fut.MoveResult().MoveValueUnsafe());
874-
completed.MarkFinished(std::move(result));
875-
});
850+
auto completed = Future<std::unique_ptr<ParquetFileReader>>::Make();
851+
fut.AddCallback(
852+
[fut, completed](
853+
const Result<std::unique_ptr<ParquetFileReader::Contents>>& contents) mutable {
854+
if (!contents.ok()) {
855+
completed.MarkFinished(contents.status());
856+
return;
857+
}
858+
std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
859+
result->Open(fut.MoveResult().MoveValueUnsafe());
860+
completed.MarkFinished(std::move(result));
861+
});
876862
return completed;
877863
END_PARQUET_CATCH_EXCEPTIONS
878864
}
@@ -919,7 +905,7 @@ void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
919905
file->PreBuffer(row_groups, column_indices, ctx, options);
920906
}
921907

922-
::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
908+
Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
923909
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
924910
int64_t hole_size_limit, int64_t range_size_limit) {
925911
// Access private methods here
@@ -929,8 +915,8 @@ ::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadR
929915
range_size_limit);
930916
}
931917

932-
::arrow::Future<> ParquetFileReader::WhenBuffered(
933-
const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
918+
Future<> ParquetFileReader::WhenBuffered(const std::vector<int>& row_groups,
919+
const std::vector<int>& column_indices) const {
934920
// Access private methods here
935921
SerializedFile* file =
936922
::arrow::internal::checked_cast<SerializedFile*>(contents_.get());

cpp/src/parquet/metadata.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,6 +1169,42 @@ void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
11691169
return impl_->WriteTo(dst, encryptor);
11701170
}
11711171

1172+
bool FileMetaData::VerifySignature(std::span<const uint8_t> serialized_metadata,
1173+
std::span<const uint8_t> signature,
1174+
InternalFileDecryptor* file_decryptor) {
1175+
DCHECK_NE(file_decryptor, nullptr);
1176+
1177+
// In plaintext footer, the "signature" is the concatenation of the nonce used
1178+
// for GCM encryption, and the authentication tag obtained after GCM encryption.
1179+
if (signature.size() != encryption::kGcmTagLength + encryption::kNonceLength) {
1180+
throw ParquetInvalidOrCorruptedFileException(
1181+
"Invalid footer encryption signature (expected ",
1182+
encryption::kGcmTagLength + encryption::kNonceLength, " bytes, got ",
1183+
signature.size(), ")");
1184+
}
1185+
1186+
// Encrypt plaintext serialized metadata so as to compute its signature
1187+
auto nonce = signature.subspan(0, encryption::kNonceLength);
1188+
auto tag = signature.subspan(encryption::kNonceLength);
1189+
const SecureString& key = file_decryptor->GetFooterKey();
1190+
const std::string& aad = encryption::CreateFooterAad(file_decryptor->file_aad());
1191+
1192+
auto aes_encryptor = encryption::AesEncryptor::Make(
1193+
file_decryptor->algorithm(), static_cast<int>(key.size()), /*metadata=*/true,
1194+
/*write_length=*/false);
1195+
1196+
std::shared_ptr<Buffer> encrypted_buffer =
1197+
AllocateBuffer(file_decryptor->pool(),
1198+
aes_encryptor->CiphertextLength(serialized_metadata.size()));
1199+
int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
1200+
serialized_metadata, key.as_span(), str2span(aad), nonce,
1201+
encrypted_buffer->mutable_span_as<uint8_t>());
1202+
DCHECK_EQ(encrypted_len, encrypted_buffer->size());
1203+
// Check computed signature against expected
1204+
return 0 == memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
1205+
tag.data(), encryption::kGcmTagLength);
1206+
}
1207+
11721208
class FileCryptoMetaData::FileCryptoMetaDataImpl {
11731209
public:
11741210
FileCryptoMetaDataImpl() = default;

cpp/src/parquet/metadata.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <map>
2222
#include <memory>
2323
#include <optional>
24+
#include <span>
2425
#include <string>
2526
#include <vector>
2627

@@ -322,8 +323,8 @@ class PARQUET_EXPORT FileMetaData {
322323
EncryptionAlgorithm encryption_algorithm() const;
323324
const std::string& footer_signing_key_metadata() const;
324325

325-
/// \brief Verify signature of FileMetaData when file is encrypted but footer
326-
/// is not encrypted (plaintext footer).
326+
PARQUET_DEPRECATED(
327+
"Deprecated in 24.0.0. If you need this functionality, please report an issue.")
327328
bool VerifySignature(const void* signature);
328329

329330
void WriteTo(::arrow::io::OutputStream* dst,
@@ -383,6 +384,11 @@ class PARQUET_EXPORT FileMetaData {
383384
void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
384385
const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
385386

387+
// Verify the signature of a plaintext footer.
388+
static bool VerifySignature(std::span<const uint8_t> serialized_metadata,
389+
std::span<const uint8_t> signature,
390+
InternalFileDecryptor* file_decryptor);
391+
386392
// PIMPL Idiom
387393
FileMetaData();
388394
class FileMetaDataImpl;

0 commit comments

Comments
 (0)