Skip to content

Commit a28a952

Browse files
subkanthi and xieandrew authored
feat: Configurable compression (#115)
Co-authored-by: Andrew Xie <dev@xie.is>
1 parent 33aea41 commit a28a952

5 files changed

Lines changed: 197 additions & 63 deletions

File tree

ice-rest-catalog/src/main/java/com/altinity/ice/rest/catalog/internal/maintenance/DataCompaction.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ private void merge(
177177

178178
Parquet.WriteBuilder writeBuilder =
179179
Parquet.write(outputFile)
180+
.setAll(table.properties())
180181
.overwrite(false)
181182
.createWriterFunc(GenericParquetWriter::buildWriter)
182183
.metricsConfig(metricsConfig)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/bash
2+
set -e
3+
4+
echo "Running insert with compression test..."
5+
6+
SCENARIO_DIR="{{SCENARIO_DIR}}"
7+
INPUT_PATH="${SCENARIO_DIR}/../insert-scan/input.parquet"
8+
9+
# Create namespace and table
10+
{{ICE_CLI}} --config {{CLI_CONFIG}} create-namespace ${NAMESPACE_NAME}
11+
echo "OK Created namespace"
12+
13+
# Verify invalid compression codec is rejected
14+
if {{ICE_CLI}} --config {{CLI_CONFIG}} insert --create-table ${TABLE_NAME} "file://${INPUT_PATH}" --compression=invalid 2>/dev/null; then
15+
echo "FAIL insert should have rejected invalid compression codec"
16+
exit 1
17+
fi
18+
echo "OK Invalid compression codec rejected"
19+
20+
# Insert with zstd compression
21+
{{ICE_CLI}} --config {{CLI_CONFIG}} insert --create-table ${TABLE_NAME} "file://${INPUT_PATH}" --compression=zstd
22+
echo "OK Inserted data with zstd compression"
23+
24+
# Scan to verify data written with zstd is readable
25+
{{ICE_CLI}} --config {{CLI_CONFIG}} scan ${TABLE_NAME} > /tmp/compression_scan1.txt
26+
if ! grep -q "sepal.length" /tmp/compression_scan1.txt; then
27+
echo "FAIL Scan output missing expected column after zstd insert"
28+
cat /tmp/compression_scan1.txt
29+
exit 1
30+
fi
31+
FIRST_LINES=$(wc -l < /tmp/compression_scan1.txt)
32+
echo "OK Scan after zstd insert: ${FIRST_LINES} lines"
33+
34+
# Insert again with snappy compression
35+
{{ICE_CLI}} --config {{CLI_CONFIG}} insert ${TABLE_NAME} "file://${INPUT_PATH}" --compression=snappy
36+
echo "OK Inserted data with snappy compression"
37+
38+
# Scan to verify both zstd and snappy data files are readable together
39+
{{ICE_CLI}} --config {{CLI_CONFIG}} scan ${TABLE_NAME} --limit 500 > /tmp/compression_scan2.txt
40+
SECOND_LINES=$(wc -l < /tmp/compression_scan2.txt)
41+
if [ "${SECOND_LINES}" -le "${FIRST_LINES}" ]; then
42+
echo "FAIL Second scan should have more rows (got ${SECOND_LINES}, first had ${FIRST_LINES})"
43+
exit 1
44+
fi
45+
echo "OK Scan after snappy insert: ${SECOND_LINES} lines"
46+
47+
# Cleanup
48+
{{ICE_CLI}} --config {{CLI_CONFIG}} delete-table ${TABLE_NAME}
49+
{{ICE_CLI}} --config {{CLI_CONFIG}} delete-namespace ${NAMESPACE_NAME}
50+
echo "OK Cleanup done"
51+
52+
echo "Insert with compression test completed successfully"
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
name: "Insert with compression codec"
2+
description: "Tests inserting data with --compression flag (zstd) and verifying data is readable"
3+
4+
catalogConfig:
5+
warehouse: "s3://test-bucket/warehouse"
6+
7+
env:
8+
NAMESPACE_NAME: "test_compression"
9+
TABLE_NAME: "test_compression.iris_zstd"
10+
INPUT_FILE: "input.parquet"

ice/src/main/java/com/altinity/ice/cli/Main.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,11 @@ void insert(
447447
description = "Number of threads to use for inserting data",
448448
defaultValue = "-1")
449449
int threadCount,
450+
@CommandLine.Option(
451+
names = {"--compression"},
452+
description =
453+
"Parquet compression codec: gzip (default), zstd, snappy, lz4, brotli, uncompressed, or as-source")
454+
String compression,
450455
@CommandLine.Option(
451456
names = {"--watch"},
452457
description = "Event queue. Supported: AWS SQS")
@@ -543,6 +548,7 @@ void insert(
543548
.sortOrderList(sortOrders)
544549
.threadCount(
545550
threadCount < 1 ? Runtime.getRuntime().availableProcessors() : threadCount)
551+
.compression(compression)
546552
.build();
547553

548554
if (!watchMode) {

ice/src/main/java/com/altinity/ice/cli/internal/cmd/Insert.java

Lines changed: 128 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@
2828
import java.util.Collections;
2929
import java.util.HashSet;
3030
import java.util.List;
31+
import java.util.Locale;
3132
import java.util.Map;
3233
import java.util.Set;
34+
import java.util.TreeSet;
3335
import java.util.concurrent.ExecutionException;
3436
import java.util.concurrent.ExecutorService;
3537
import java.util.concurrent.Executors;
@@ -76,6 +78,7 @@
7678
import org.apache.iceberg.parquet.Parquet;
7779
import org.apache.iceberg.parquet.ParquetUtil;
7880
import org.apache.iceberg.rest.RESTCatalog;
81+
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
7982
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
8083
import org.apache.parquet.schema.MessageType;
8184
import org.slf4j.Logger;
@@ -108,6 +111,20 @@ public static Result run(
108111
return new Result(0, 0);
109112
}
110113

114+
if (options.compression() != null) {
115+
// Validate against the list of parquet compression codecs supported by Iceberg.
116+
Set<String> validCompressionCodecs =
117+
Arrays.stream(CompressionCodecName.values())
118+
.map(c -> c.name().toLowerCase(Locale.ENGLISH))
119+
.collect(Collectors.toCollection(HashSet::new));
120+
validCompressionCodecs.add("as-source");
121+
if (!validCompressionCodecs.contains(options.compression().toLowerCase(Locale.ENGLISH))) {
122+
String accepted = String.join(", ", new TreeSet<>(validCompressionCodecs));
123+
throw new IllegalArgumentException(
124+
"Unknown --compression value: " + options.compression() + ". Accepted: " + accepted);
125+
}
126+
}
127+
111128
Table table = catalog.loadTable(nsTable);
112129

113130
// Create transaction and pass it to updatePartitionAndSortOrderMetadata
@@ -501,72 +518,98 @@ private static List<DataFile> processFile(
501518
.build());
502519
dataFileSizeInBytes = inputFile.getLength();
503520
dataFile = dstDataFile;
504-
} else if (partitionSpec.isPartitioned() && partitionKey == null) {
505-
return copyPartitionedAndSorted(
506-
file,
507-
tableSchema,
508-
partitionSpec,
509-
sortOrder,
510-
metricsConfig,
511-
tableIO,
512-
inputFile,
513-
dstDataFileSource);
514-
} else if (sortOrder.isSorted() && !sorted) {
515-
return Collections.singletonList(
516-
copySorted(
517-
file,
518-
dstDataFileSource.get(file),
519-
tableSchema,
520-
partitionSpec,
521-
sortOrder,
522-
metricsConfig,
523-
tableIO,
524-
inputFile,
525-
dataFileNamingStrategy,
526-
partitionKey));
527521
} else {
528-
// Table isn't partitioned or sorted. Copy as is.
529-
String dstDataFile;
530-
if (partitionSpec.isPartitioned() && partitionKey != null) {
531-
// File has inferred partition, use partition path
532-
dstDataFile = dstDataFileSource.get(partitionSpec, partitionKey, file);
533-
} else {
534-
dstDataFile = dstDataFileSource.get(file);
535-
}
536-
if (checkNotExists.apply(dstDataFile)) {
537-
return Collections.emptyList();
522+
// Copy path: compute compression override from CLI or as-source
523+
String compressionCodecOverride = null;
524+
if (options.compression() != null) {
525+
if ("as-source".equalsIgnoreCase(options.compression())) {
526+
var blocks = metadata.getBlocks();
527+
if (!blocks.isEmpty()) {
528+
compressionCodecOverride =
529+
blocks.get(0).getColumns().get(0).getCodec().name().toLowerCase();
530+
}
531+
} else {
532+
compressionCodecOverride = options.compression().toLowerCase();
533+
}
538534
}
539-
OutputFile outputFile =
540-
tableIO.newOutputFile(Strings.replacePrefix(dstDataFile, "s3://", "s3a://"));
541-
// TODO: support transferTo below (note that compression, etc. might be different)
542-
// try (var d = outputFile.create()) {
543-
// try (var s = inputFile.newStream()) { s.transferTo(d); }
544-
// }
545-
Parquet.ReadBuilder readBuilder =
546-
Parquet.read(inputFile)
547-
.createReaderFunc(s -> GenericParquetReaders.buildReader(tableSchema, s))
548-
.project(tableSchema)
549-
.reuseContainers();
550535

551-
Parquet.WriteBuilder writeBuilder =
552-
Parquet.write(outputFile)
553-
.overwrite(dataFileNamingStrategy == DataFileNamingStrategy.Name.PRESERVE_ORIGINAL)
554-
.createWriterFunc(GenericParquetWriter::buildWriter)
555-
.metricsConfig(metricsConfig)
556-
.schema(tableSchema);
536+
if (partitionSpec.isPartitioned() && partitionKey == null) {
537+
return copyPartitionedAndSorted(
538+
file,
539+
tableSchema,
540+
partitionSpec,
541+
sortOrder,
542+
metricsConfig,
543+
tableIO,
544+
inputFile,
545+
dstDataFileSource,
546+
table.properties(),
547+
compressionCodecOverride);
548+
} else if (sortOrder.isSorted() && !sorted) {
549+
return Collections.singletonList(
550+
copySorted(
551+
file,
552+
dstDataFileSource.get(file),
553+
tableSchema,
554+
partitionSpec,
555+
sortOrder,
556+
metricsConfig,
557+
tableIO,
558+
inputFile,
559+
dataFileNamingStrategy,
560+
partitionKey,
561+
table.properties(),
562+
compressionCodecOverride));
563+
} else {
564+
// Table isn't partitioned or sorted. Copy as is.
565+
String dstDataFile;
566+
if (partitionSpec.isPartitioned() && partitionKey != null) {
567+
// File has inferred partition, use partition path
568+
dstDataFile = dstDataFileSource.get(partitionSpec, partitionKey, file);
569+
} else {
570+
dstDataFile = dstDataFileSource.get(file);
571+
}
572+
if (checkNotExists.apply(dstDataFile)) {
573+
return Collections.emptyList();
574+
}
575+
OutputFile outputFile =
576+
tableIO.newOutputFile(Strings.replacePrefix(dstDataFile, "s3://", "s3a://"));
577+
// TODO: support transferTo below (note that compression, etc. might be different)
578+
// try (var d = outputFile.create()) {
579+
// try (var s = inputFile.newStream()) { s.transferTo(d); }
580+
// }
581+
Parquet.ReadBuilder readBuilder =
582+
Parquet.read(inputFile)
583+
.createReaderFunc(s -> GenericParquetReaders.buildReader(tableSchema, s))
584+
.project(tableSchema)
585+
.reuseContainers();
586+
587+
Parquet.WriteBuilder writeBuilder =
588+
Parquet.write(outputFile)
589+
.setAll(table.properties())
590+
.overwrite(dataFileNamingStrategy == DataFileNamingStrategy.Name.PRESERVE_ORIGINAL)
591+
.createWriterFunc(GenericParquetWriter::buildWriter)
592+
.metricsConfig(metricsConfig)
593+
.schema(tableSchema);
594+
if (compressionCodecOverride != null) {
595+
596+
writeBuilder =
597+
writeBuilder.set(TableProperties.PARQUET_COMPRESSION, compressionCodecOverride);
598+
}
557599

558-
logger.info("{}: copying to {}", file, dstDataFile);
600+
logger.info("{}: copying to {}", file, dstDataFile);
559601

560-
try (CloseableIterable<Record> parquetReader = readBuilder.build()) {
561-
try (FileAppender<Record> writer = writeBuilder.build()) {
562-
writer.addAll(parquetReader);
563-
writer.close(); // for write.length()
564-
dataFileSizeInBytes = writer.length();
565-
metrics = writer.metrics();
602+
try (CloseableIterable<Record> parquetReader = readBuilder.build()) {
603+
try (FileAppender<Record> writer = writeBuilder.build()) {
604+
writer.addAll(parquetReader);
605+
writer.close(); // for write.length()
606+
dataFileSizeInBytes = writer.length();
607+
metrics = writer.metrics();
608+
}
566609
}
567-
}
568610

569-
dataFile = dstDataFile;
611+
dataFile = dstDataFile;
612+
}
570613
}
571614
logger.info(
572615
"{}: adding data file (copy took {}s)", file, (System.currentTimeMillis() - start) / 1000);
@@ -594,7 +637,9 @@ private static List<DataFile> copyPartitionedAndSorted(
594637
MetricsConfig metricsConfig,
595638
FileIO tableIO,
596639
InputFile inputFile,
597-
DataFileNamingStrategy dstDataFileSource)
640+
DataFileNamingStrategy dstDataFileSource,
641+
Map<String, String> tableProperties,
642+
@Nullable String compressionCodecOverride)
598643
throws IOException {
599644
logger.info("{}: partitioning{}", file, sortOrder.isSorted() ? "+sorting" : "");
600645

@@ -628,10 +673,15 @@ private static List<DataFile> copyPartitionedAndSorted(
628673

629674
Parquet.WriteBuilder writeBuilder =
630675
Parquet.write(outputFile)
676+
.setAll(tableProperties)
631677
.overwrite(true) // FIXME
632678
.createWriterFunc(GenericParquetWriter::buildWriter)
633679
.metricsConfig(metricsConfig)
634680
.schema(tableSchema);
681+
if (compressionCodecOverride != null) {
682+
writeBuilder =
683+
writeBuilder.set(TableProperties.PARQUET_COMPRESSION, compressionCodecOverride);
684+
}
635685

636686
try (FileAppender<Record> writer = writeBuilder.build()) {
637687
for (Record record : records) {
@@ -674,7 +724,9 @@ private static DataFile copySorted(
674724
FileIO tableIO,
675725
InputFile inputFile,
676726
DataFileNamingStrategy.Name dataFileNamingStrategy,
677-
PartitionKey partitionKey)
727+
PartitionKey partitionKey,
728+
Map<String, String> tableProperties,
729+
@Nullable String compressionCodecOverride)
678730
throws IOException {
679731
logger.info("{}: copying (sorted) to {}", file, dstDataFile);
680732

@@ -704,11 +756,16 @@ private static DataFile copySorted(
704756
// Write sorted records to outputFile
705757
Parquet.WriteBuilder writeBuilder =
706758
Parquet.write(outputFile)
759+
.setAll(tableProperties)
707760
.overwrite(
708761
dataFileNamingStrategy == DataFileNamingStrategy.Name.PRESERVE_ORIGINAL) // FIXME
709762
.createWriterFunc(GenericParquetWriter::buildWriter)
710763
.metricsConfig(metricsConfig)
711764
.schema(tableSchema);
765+
if (compressionCodecOverride != null) {
766+
writeBuilder =
767+
writeBuilder.set(TableProperties.PARQUET_COMPRESSION, compressionCodecOverride);
768+
}
712769

713770
long fileSizeInBytes;
714771
Metrics metrics;
@@ -799,7 +856,8 @@ public record Options(
799856
@Nullable String retryListFile,
800857
@Nullable List<Main.IcePartition> partitionList,
801858
@Nullable List<Main.IceSortOrder> sortOrderList,
802-
int threadCount) {
859+
int threadCount,
860+
@Nullable String compression) {
803861

804862
public static Builder builder() {
805863
return new Builder();
@@ -822,6 +880,7 @@ public static final class Builder {
822880
private List<Main.IcePartition> partitionList = List.of();
823881
private List<Main.IceSortOrder> sortOrderList = List.of();
824882
private int threadCount = Runtime.getRuntime().availableProcessors();
883+
private String compression;
825884

826885
private Builder() {}
827886

@@ -905,6 +964,11 @@ public Builder threadCount(int threadCount) {
905964
return this;
906965
}
907966

967+
public Builder compression(String compression) {
968+
this.compression = compression;
969+
return this;
970+
}
971+
908972
public Options build() {
909973
return new Options(
910974
dataFileNamingStrategy,
@@ -922,7 +986,8 @@ public Options build() {
922986
retryListFile,
923987
partitionList,
924988
sortOrderList,
925-
threadCount);
989+
threadCount,
990+
compression);
926991
}
927992
}
928993
}

0 commit comments

Comments (0)