Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions conf/cassandra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,21 @@ commitlog_disk_access_mode: legacy
# - direct: use direct I/O for compaction reads, bypassing the OS page cache
# compaction_read_disk_access_mode: auto

# Set the disk access mode for writing compressed SSTables during background operations
# (compaction, streaming, scrub, cleanup, repair, etc.). The allowed values are:
# - standard: use buffered I/O (default)
# - direct: use direct I/O, bypassing the OS page cache
# Note: Only applies to compressed tables. Uncompressed tables always use buffered I/O.
# Note: Memtable flushes always use buffered I/O regardless of this setting, as flushed
# data benefits from page cache for subsequent reads.
# background_write_disk_access_mode: standard

# Size of the in-memory staging buffer for Direct IO background writes. Trades off syscall
# frequency against per-flush blocking latency on the compaction thread.
# Aligned up to filesystem block size; auto-expands to fit a single compressed chunk + CRC
# + one block when chunk_length exceeds this value.
# direct_write_buffer_size: 1MiB

# Compression to apply to SSTables as they flush for compressed tables.
# Note that tables without compression enabled do not respect this flag.
#
Expand Down
12 changes: 12 additions & 0 deletions src/java/org/apache/cassandra/config/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,18 @@ public MemtableOptions()

public DataStorageSpec.IntKibibytesBound compressed_read_ahead_buffer_size = new DataStorageSpec.IntKibibytesBound("256KiB");

// Direct IO for background SSTable writes (compaction, streaming, scrub, cleanup, etc.)
// When 'direct' is set, background writes bypass the OS page cache using O_DIRECT.
// Memtable flushes always use buffered I/O regardless of this setting.
// Default is 'standard' (buffered I/O) - users must opt-in to Direct IO
public DiskAccessMode background_write_disk_access_mode = DiskAccessMode.standard;

// Size of the in-memory staging buffer for Direct IO background writes. Trades off syscall
// frequency against per-flush blocking latency on the compaction thread.
// Aligned up to filesystem block size; auto-expands to fit a single compressed chunk + CRC
// + one block when chunk_length exceeds this value.
public DataStorageSpec.IntKibibytesBound direct_write_buffer_size = new DataStorageSpec.IntKibibytesBound("1MiB");

// fraction of free disk space available for compaction after min free space is subtracted
public volatile Double max_space_usable_for_compactions_in_percentage = .95;

Expand Down
79 changes: 79 additions & 0 deletions src/java/org/apache/cassandra/config/DatabaseDescriptor.java
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ public class DatabaseDescriptor

private static DiskAccessMode compactionReadDiskAccessMode;

private static DiskAccessMode backgroundWriteDiskAccessMode;

private static AbstractCryptoProvider cryptoProvider;
private static IAuthenticator authenticator;
private static IAuthorizer authorizer;
Expand Down Expand Up @@ -897,6 +899,10 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
if (conf.hints_directory.equals(conf.saved_caches_directory))
throw new ConfigurationException("saved_caches_directory must not be the same as the hints_directory", false);

initializeBackgroundWriteDiskAccessMode();
if (backgroundWriteDiskAccessMode != conf.background_write_disk_access_mode)
logger.info("background_write_disk_access_mode resolved to: {}", backgroundWriteDiskAccessMode);

if (conf.memtable_flush_writers == 0)
{
conf.memtable_flush_writers = conf.data_file_directories.length == 1 ? 2 : 1;
Expand Down Expand Up @@ -3406,6 +3412,79 @@ public static void initializeCommitLogDiskAccessMode()
commitLogWriteDiskAccessMode = accessModeDirectIoPair.left;
}

/**
 * Returns the background-write disk access mode currently held in the static
 * {@code backgroundWriteDiskAccessMode} field.
 * <p>
 * The field is assigned by {@link #initializeBackgroundWriteDiskAccessMode()} (which resolves
 * {@code auto} to {@code standard}) or by the test-only setter; this getter performs no
 * resolution or validation of its own.
 *
 * @return the stored mode; NOTE(review): looks like this is {@code null} until one of the
 *         above has run — confirm against the initialization order
 */
public static DiskAccessMode getBackgroundWriteDiskAccessMode()
{
return backgroundWriteDiskAccessMode;
}

/**
 * Test hook that overrides the background-write disk access mode.
 * <p>
 * Writes the given value to both the resolved static field and the raw
 * {@code conf.background_write_disk_access_mode} setting so the two stay in agreement.
 * None of the checks in {@link #initializeBackgroundWriteDiskAccessMode()} are applied here.
 *
 * @param diskAccessMode the mode to store, taken as-is
 */
@VisibleForTesting
public static void setBackgroundWriteDiskAccessMode(DiskAccessMode diskAccessMode)
{
backgroundWriteDiskAccessMode = diskAccessMode;
conf.background_write_disk_access_mode = diskAccessMode;
}

/**
 * Returns the configured {@code direct_write_buffer_size} exactly as held in {@code conf}.
 * <p>
 * No block-size alignment or expansion is performed here; the value is handed back unchanged.
 *
 * @return the configured staging buffer size for Direct IO background writes
 */
public static DataStorageSpec.IntKibibytesBound getDirectWriteBufferSize()
{
return conf.direct_write_buffer_size;
}

/**
 * Resolves {@code background_write_disk_access_mode} from the loaded configuration and stores the
 * result in {@code backgroundWriteDiskAccessMode}.
 * <p>
 * {@code auto} is resolved to {@code standard}. When the mode is {@code direct}, the configured
 * {@code direct_write_buffer_size} must be positive and, unless {@code toolInitialized} is set,
 * every configured data directory must pass the Direct I/O support check.
 *
 * @throws ConfigurationException if the mode is {@code direct} and either the buffer size is not
 *                                positive or at least one data directory failed the support check
 */
@VisibleForTesting
public static void initializeBackgroundWriteDiskAccessMode()
{
    DiskAccessMode providedMode = conf.background_write_disk_access_mode;

    if (providedMode == DiskAccessMode.auto)
        providedMode = DiskAccessMode.standard;

    if (providedMode == DiskAccessMode.direct)
    {
        // DataStorageSpec already rejects negatives at parse time; zero is the remaining
        // nonsense value. The writer's Math.max would silently coerce it to minRequiredSize,
        // which masks a likely operator mistake — fail fast instead.
        if (conf.direct_write_buffer_size.toBytes() <= 0)
            throw new ConfigurationException("direct_write_buffer_size must be > 0 when background_write_disk_access_mode is 'direct'. " +
                                             "Got: " + conf.direct_write_buffer_size, false);

        // The filesystem probe is skipped when toolInitialized is set.
        if (!toolInitialized)
        {
            List<String> unsupportedLocations = findDataDirectoriesWithoutDirectIo();

            if (!unsupportedLocations.isEmpty())
            {
                throw new ConfigurationException(
                String.format("background_write_disk_access_mode is set to 'direct', but the following data directories " +
                              "do not support Direct I/O: %s. Either change background_write_disk_access_mode to 'standard' " +
                              "in cassandra.yaml, or ensure all data directories are on filesystems that support Direct I/O.",
                              unsupportedLocations), false);
            }
        }
    }

    backgroundWriteDiskAccessMode = providedMode;
}

/**
 * Probes each configured data directory (creating it first if it does not exist) and collects the
 * ones for which Direct I/O support could not be confirmed.
 *
 * @return the directories reported as unsupported, plus any whose probe threw a
 *         {@link RuntimeException} (annotated with the failure message); empty if all passed
 */
private static List<String> findDataDirectoriesWithoutDirectIo()
{
    List<String> unsupportedLocations = new ArrayList<>();

    for (String dataDir : conf.data_file_directories)
    {
        try
        {
            File dataDirFile = new File(dataDir);
            PathUtils.createDirectoriesIfNotExists(dataDirFile.toPath());

            if (!FileUtils.isDirectIOSupported(dataDirFile))
                unsupportedLocations.add(dataDir);
        }
        catch (RuntimeException e)
        {
            // Hand the throwable to the logger as the trailing argument so the stack trace is
            // recorded; logging only the message hides where the probe actually failed.
            logger.warn("Unable to determine Direct IO support for data directory {}: {}", dataDir, e.getMessage(), e);
            unsupportedLocations.add(dataDir + " (check failed: " + e.getMessage() + ")");
        }
    }

    return unsupportedLocations;
}

public static String getSavedCachesLocation()
{
return conf.saved_caches_directory;
Expand Down
49 changes: 49 additions & 0 deletions src/java/org/apache/cassandra/io/DirectIoSupport.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.io;

/**
 * Describes whether an operation may use a direct-IO (O_DIRECT) data path and, when it may not,
 * which class of reason excludes it. Callers keep their own per-operation classification and
 * combine this value with their other gates (e.g. compression, configuration mode), so
 * {@link #SUPPORTED} is necessary but not sufficient.
 */
public enum DirectIoSupport
{
    /**
     * Eligible for the direct-IO data path.
     */
    SUPPORTED(true),

    /**
     * The direct-IO path is mechanically incompatible with this operation. Lifting this
     * exclusion requires code changes, not a policy decision.
     */
    UNSUPPORTED_CORRECTNESS(false),

    /**
     * Direct IO would work but is deliberately disabled for performance or cache-residency
     * reasons. Lifting this exclusion requires re-evaluating the policy, not code changes.
     */
    UNSUPPORTED_POLICY(false);

    // true only for SUPPORTED
    private final boolean eligible;

    DirectIoSupport(boolean eligible)
    {
        this.eligible = eligible;
    }

    /**
     * @return {@code true} only for {@link #SUPPORTED}
     */
    public boolean isSupported()
    {
        return eligible;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.file.OpenOption;
import java.util.Optional;
import java.util.zip.CRC32;

Expand Down Expand Up @@ -50,66 +51,76 @@ public class CompressedSequentialWriter extends SequentialWriter

// holds offset in the file where current chunk should be written
// changed only by flush() method where data buffer gets compressed and stored to the file
private long chunkOffset = 0;
protected long chunkOffset = 0;

// index file writer (random I/O)
private final CompressionMetadata.Writer metadataWriter;
protected final CompressionMetadata.Writer metadataWriter;
private final ICompressor compressor;

// used to store compressed data
private ByteBuffer compressed;

// holds a number of already written chunks
private int chunkCount = 0;
protected int chunkCount = 0;

private long uncompressedSize = 0, compressedSize = 0;
protected long uncompressedSize = 0;
protected long compressedSize = 0;

private final MetadataCollector sstableMetadataCollector;
protected final MetadataCollector sstableMetadataCollector;
private final CompressionDictionaryManager compressionDictionaryManager;

private final ByteBuffer crcCheckBuffer = ByteBuffer.allocate(4);
private final Optional<File> digestFile;
protected final Optional<File> digestFile;

private final int maxCompressedLength;
private final boolean isDictionaryEnabled;

/**
 * Allocates the writer's data buffer: one chunk long, of the buffer type preferred by the
 * table's SSTable compressor.
 *
 * @param parameters compression parameters supplying the compressor and chunk length
 * @return a newly allocated buffer of {@code parameters.chunkLength()} bytes
 */
private static ByteBuffer allocateBuffer(CompressionParams parameters)
{
    ICompressor sstableCompressor = parameters.getSstableCompressor();
    return sstableCompressor.preferredBufferType()
                            .allocate(parameters.chunkLength());
}

/**
 * Derives the writer option actually used by the compressed writer. Buffer size and buffer type
 * come from the compression parameters; only {@code finishOnClose} is carried over from the
 * caller-supplied option.
 *
 * @param option     caller-supplied option; only its {@code finishOnClose} flag is read
 * @param parameters compression parameters supplying chunk length and compressor
 * @return a new option with chunk-sized buffering of the compressor's preferred buffer type
 */
private static SequentialWriterOption buildOption(SequentialWriterOption option, CompressionParams parameters)
{
    int chunkLength = parameters.chunkLength();
    ICompressor sstableCompressor = parameters.getSstableCompressor();

    return SequentialWriterOption.newBuilder()
                                 .bufferSize(chunkLength)
                                 .bufferType(sstableCompressor.preferredBufferType())
                                 .finishOnClose(option.finishOnClose())
                                 .build();
}

public CompressedSequentialWriter(File file,
File offsetsFile,
File digestFile,
@Nullable File digestFile,
SequentialWriterOption option,
CompressionParams parameters,
MetadataCollector sstableMetadataCollector)
{
this(file, offsetsFile, digestFile, option, parameters, sstableMetadataCollector, null);
}


/**
* Create CompressedSequentialWriter without digest file.
* Create CompressedSequentialWriter with optional compression dictionary and channel options.
*
* @param file File to write
* @param offsetsFile File to write compression metadata
* @param digestFile File to write digest
* @param digestFile File to write digest, or null if not needed
* @param option Write option (buffer size and type will be set the same as compression params)
* @param parameters Compression parameters
* @param sstableMetadataCollector Metadata collector
* @param compressionDictionaryManager manages compression dictionary; null if absent
* @param extraOpenOptions additional options to pass to FileChannel.open (e.g., ExtendedOpenOption.DIRECT)
*/
public CompressedSequentialWriter(File file,
File offsetsFile,
File digestFile,
@Nullable File digestFile,
SequentialWriterOption option,
CompressionParams parameters,
MetadataCollector sstableMetadataCollector,
@Nullable CompressionDictionaryManager compressionDictionaryManager)
@Nullable CompressionDictionaryManager compressionDictionaryManager,
OpenOption... extraOpenOptions)
{
super(file, SequentialWriterOption.newBuilder()
.bufferSize(option.bufferSize())
.bufferType(option.bufferType())
.bufferSize(parameters.chunkLength())
.bufferType(parameters.getSstableCompressor().preferredBufferType())
.finishOnClose(option.finishOnClose())
.build());
super(file, allocateBuffer(parameters), buildOption(option, parameters), true, extraOpenOptions);
ICompressor compressor = parameters.getSstableCompressor();
this.digestFile = Optional.ofNullable(digestFile);

Expand Down Expand Up @@ -142,7 +153,7 @@ public CompressedSequentialWriter(File file,
metadataWriter = CompressionMetadata.Writer.open(parameters, offsetsFile, compressionDictionary);

this.sstableMetadataCollector = sstableMetadataCollector;
crcMetadata = new ChecksumWriter(new DataOutputStream(Channels.newOutputStream(channel)));
crcMetadata = new ChecksumWriter(new DataOutputStream(Channels.newOutputStream(this.channel)));
}

@Override
Expand Down Expand Up @@ -178,7 +189,9 @@ public void flush()
@Override
protected void flushData()
{
seekToChunkStart(); // why is this necessary? seems like it should always be at chunk start in normal operation
// resetAndTruncate leaves fchannel.position() past EOF after its verification reads + truncate;
// re-seek so the next chunk lands at chunkOffset. No-op under linear writes.
seekToChunkStart();

try
{
Expand Down Expand Up @@ -216,32 +229,36 @@ protected void flushData()
}
compressedSize += compressedLength;

// write an offset of the newly written chunk to the index file
metadataWriter.addOffset(chunkOffset);
chunkCount++;

// write out the compressed data and checksum
toWrite.flip();
writeChunk(toWrite);
lastFlushOffset = uncompressedSize;

if (toWrite == buffer)
buffer.position(uncompressedLength);

// next chunk should be written right after current + length of the checksum (int)
chunkOffset += compressedLength + 4;
if (runPostFlush != null)
runPostFlush.accept(getLastFlushOffset());
}

protected void writeChunk(ByteBuffer toWrite)
{
try
{
// write an offset of the newly written chunk to the index file
metadataWriter.addOffset(chunkOffset);
chunkCount++;

// write out the compressed data
toWrite.flip();
channel.write(toWrite);

// write corresponding checksum
toWrite.rewind();
crcMetadata.appendDirect(toWrite, true);
lastFlushOffset = uncompressedSize;
}
catch (IOException e)
{
throw new FSWriteError(e, getPath());
}
if (toWrite == buffer)
buffer.position(uncompressedLength);

// next chunk should be written right after current + length of the checksum (int)
chunkOffset += compressedLength + 4;
if (runPostFlush != null)
runPostFlush.accept(getLastFlushOffset());
}

public CompressionMetadata open(long overrideLength)
Expand Down Expand Up @@ -358,10 +375,16 @@ private void truncate(long toFileSize, long toBufferOffset)
}
}

/**
 * Writes the full checksum held by {@code crcMetadata} to the digest file, if one was supplied;
 * does nothing when {@code digestFile} is empty. Invoked from {@code doPrepare()}.
 */
protected void writeDigestFile()
{
digestFile.ifPresent(crcMetadata::writeFullChecksum);
}

/**
* Seek to the offset where next compressed data chunk should be stored.
* Subclasses may override if they manage their own channel.
*/
private void seekToChunkStart()
protected void seekToChunkStart()
{
if (getOnDiskFilePointer() != chunkOffset)
{
Expand Down Expand Up @@ -429,7 +452,7 @@ protected Throwable doAbort(Throwable accumulate)
protected void doPrepare()
{
syncInternal();
digestFile.ifPresent(crcMetadata::writeFullChecksum);
writeDigestFile();
sstableMetadataCollector.addCompressionRatio(compressedSize, uncompressedSize);
metadataWriter.finalizeLength(current(), chunkCount).prepareToCommit();
}
Expand Down
Loading