apache · samueldlightfoot · May 19, 2026
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
@@ -693,6 +693,21 @@ commitlog_disk_access_mode: legacy
 # - direct: use direct I/O for compaction reads, bypassing the OS page cache
 # compaction_read_disk_access_mode: auto
 
+# Set the disk access mode for writing compressed SSTables during background operations
+# (compaction, streaming, scrub, cleanup, repair, etc.). The allowed values are:
+# - standard: use buffered I/O (default)
+# - direct: use direct I/O, bypassing the OS page cache
+# Note: Only applies to compressed tables. Uncompressed tables always use buffered I/O.
+# Note: Memtable flushes always use buffered I/O regardless of this setting, as flushed
+# data benefits from page cache for subsequent reads.
+# background_write_disk_access_mode: standard
+
+# Size of the in-memory staging buffer for Direct IO background writes; must be > 0 when
+# background_write_disk_access_mode is 'direct'. Trades off syscall frequency against per-flush
+# blocking latency on the compaction thread. Aligned up to filesystem block size; auto-expands
+# to fit a single compressed chunk + CRC + one block when chunk_length exceeds this value.
+# direct_write_buffer_size: 1MiB
+
 # Compression to apply to SSTables as they flush for compressed tables.
 # Note that tables without compression enabled do not respect this flag.
 #

diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
@@ -362,6 +362,18 @@ public MemtableOptions()
 
     public DataStorageSpec.IntKibibytesBound compressed_read_ahead_buffer_size = new DataStorageSpec.IntKibibytesBound("256KiB");
 
+    // Direct IO for background SSTable writes (compaction, streaming, scrub, cleanup, etc.)
+    // When 'direct' is set, background writes bypass the OS page cache using O_DIRECT.
+    // Memtable flushes always use buffered I/O regardless of this setting.
+    // Default is 'standard' (buffered I/O); users must opt in to Direct IO.
+    public DiskAccessMode background_write_disk_access_mode = DiskAccessMode.standard;
+
+    // Size of the in-memory staging buffer for Direct IO background writes; must be > 0 when
+    // background_write_disk_access_mode is 'direct'. Trades off syscall frequency against per-flush
+    // blocking latency on the compaction thread. Aligned up to filesystem block size; auto-expands
+    // to fit a single compressed chunk + CRC + one block when chunk_length exceeds this value.
+    public DataStorageSpec.IntKibibytesBound direct_write_buffer_size = new DataStorageSpec.IntKibibytesBound("1MiB");
+
     // fraction of free disk space available for compaction after min free space is subtracted
     public volatile Double max_space_usable_for_compactions_in_percentage = .95;
 

diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -224,6 +224,8 @@ public class DatabaseDescriptor
 
     private static DiskAccessMode compactionReadDiskAccessMode;
 
+    private static DiskAccessMode backgroundWriteDiskAccessMode;
+
     private static AbstractCryptoProvider cryptoProvider;
     private static IAuthenticator authenticator;
     private static IAuthorizer authorizer;
@@ -897,6 +899,10 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
         if (conf.hints_directory.equals(conf.saved_caches_directory))
             throw new ConfigurationException("saved_caches_directory must not be the same as the hints_directory", false);
 
+        initializeBackgroundWriteDiskAccessMode();
+        if (backgroundWriteDiskAccessMode != conf.background_write_disk_access_mode)
+            logger.info("background_write_disk_access_mode resolved to: {}", backgroundWriteDiskAccessMode);
+
         if (conf.memtable_flush_writers == 0)
         {
             conf.memtable_flush_writers = conf.data_file_directories.length == 1 ? 2 : 1;
@@ -3406,6 +3412,79 @@ public static void initializeCommitLogDiskAccessMode()
         commitLogWriteDiskAccessMode = accessModeDirectIoPair.left;
     }
 
+    public static DiskAccessMode getBackgroundWriteDiskAccessMode()
+    {
+        return backgroundWriteDiskAccessMode;
+    }
+
+    @VisibleForTesting
+    public static void setBackgroundWriteDiskAccessMode(DiskAccessMode diskAccessMode)
+    {
+        backgroundWriteDiskAccessMode = diskAccessMode;
+        conf.background_write_disk_access_mode = diskAccessMode;
+    }
+
+    public static DataStorageSpec.IntKibibytesBound getDirectWriteBufferSize()
+    {
+        return conf.direct_write_buffer_size;
+    }
+
+    /**
+     * Resolves the configured background write mode ({@code auto} becomes {@code standard}) and,
+     * for {@code direct}, validates the buffer size and (unless toolInitialized) each data directory.
+     */
+    @VisibleForTesting
+    public static void initializeBackgroundWriteDiskAccessMode()
+    {
+        DiskAccessMode providedMode = conf.background_write_disk_access_mode;
+
+        if (providedMode == DiskAccessMode.auto)
+            providedMode = DiskAccessMode.standard;
+
+        if (providedMode == DiskAccessMode.direct)
+        {
+            // DataStorageSpec already rejects negatives at parse time; zero is the remaining
+            // nonsense value. The writer's Math.max would silently coerce it to minRequiredSize,
+            // which masks a likely operator mistake — fail fast instead.
+            if (conf.direct_write_buffer_size.toBytes() <= 0)
+                throw new ConfigurationException("direct_write_buffer_size must be > 0 when background_write_disk_access_mode is 'direct'. " +
+                                                 "Got: " + conf.direct_write_buffer_size, false);
+
+            if (!toolInitialized)
+            {
+                List<String> unsupportedLocations = new ArrayList<>();
+                for (String dataDir : conf.data_file_directories)
+                {
+                    try
+                    {
+                        File dataDirFile = new File(dataDir);
+                        PathUtils.createDirectoriesIfNotExists(dataDirFile.toPath());
+
+                        if (!FileUtils.isDirectIOSupported(dataDirFile))
+                            unsupportedLocations.add(dataDir);
+                    }
+                    catch (RuntimeException e)
+                    {
+                        // Pass the exception as the final argument so the stack trace is logged too.
+                        logger.warn("Unable to determine Direct IO support for data directory {}: {}", dataDir, e.getMessage(), e);
+                        unsupportedLocations.add(dataDir + " (check failed: " + e.getMessage() + ")");
+                    }
+                }
+
+                if (!unsupportedLocations.isEmpty())
+                {
+                    throw new ConfigurationException(
+                        String.format("background_write_disk_access_mode is set to 'direct', but the following data directories " +
+                                      "do not support Direct I/O: %s. Either change background_write_disk_access_mode to 'standard' " +
+                                      "in cassandra.yaml, or ensure all data directories are on filesystems that support Direct I/O.",
+                                      unsupportedLocations), false);
+                }
+            }
+        }
+
+        backgroundWriteDiskAccessMode = providedMode;
+    }
+
     public static String getSavedCachesLocation()
     {
         return conf.saved_caches_directory;

diff --git a/src/java/org/apache/cassandra/io/DirectIoSupport.java b/src/java/org/apache/cassandra/io/DirectIoSupport.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io;
+
+/**
+ * Classifies an operation's eligibility for a direct-IO (O_DIRECT) data path, encoding both
+ * the answer and the rationale class. Consumers maintain their own per-operation classification
+ * and apply this alongside their own gates (e.g. compression, configuration mode);
+ * {@link #SUPPORTED} is necessary but not sufficient.
+ */
+public enum DirectIoSupport
+{
+    /**
+     * Eligible for the direct-IO data path.
+     */
+    SUPPORTED,
+
+    /**
+     * The direct-IO path is mechanically incompatible with this operation. Removing this
+     * exclusion requires code changes, not policy.
+     */
+    UNSUPPORTED_CORRECTNESS,
+
+    /**
+     * Direct IO would work but is deliberately disabled for performance or cache-residency
+     * reasons. Removing this exclusion requires re-evaluating the policy, not code changes.
+     */
+    UNSUPPORTED_POLICY;
+
+    public boolean isSupported()
+    {
+        return this == SUPPORTED;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java b/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
@@ -22,6 +22,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
+import java.nio.file.OpenOption;
 import java.util.Optional;
 import java.util.zip.CRC32;
 
@@ -50,66 +51,76 @@ public class CompressedSequentialWriter extends SequentialWriter
 
     // holds offset in the file where current chunk should be written
     // changed only by flush() method where data buffer gets compressed and stored to the file
-    private long chunkOffset = 0;
+    protected long chunkOffset = 0;
 
     // index file writer (random I/O)
-    private final CompressionMetadata.Writer metadataWriter;
+    protected final CompressionMetadata.Writer metadataWriter;
     private final ICompressor compressor;
 
     // used to store compressed data
     private ByteBuffer compressed;
 
     // holds a number of already written chunks
-    private int chunkCount = 0;
+    protected int chunkCount = 0;
 
-    private long uncompressedSize = 0, compressedSize = 0;
+    protected long uncompressedSize = 0;
+    protected long compressedSize = 0;
 
-    private final MetadataCollector sstableMetadataCollector;
+    protected final MetadataCollector sstableMetadataCollector;
     private final CompressionDictionaryManager compressionDictionaryManager;
 
     private final ByteBuffer crcCheckBuffer = ByteBuffer.allocate(4);
-    private final Optional<File> digestFile;
+    protected final Optional<File> digestFile;
 
     private final int maxCompressedLength;
     private final boolean isDictionaryEnabled;
 
+    private static ByteBuffer allocateBuffer(CompressionParams parameters)
+    {
+        return parameters.getSstableCompressor().preferredBufferType().allocate(parameters.chunkLength());
+    }
+
+    private static SequentialWriterOption buildOption(SequentialWriterOption option, CompressionParams parameters)
+    {
+        return SequentialWriterOption.newBuilder()
+                                     .bufferSize(parameters.chunkLength())
+                                     .bufferType(parameters.getSstableCompressor().preferredBufferType())
+                                     .finishOnClose(option.finishOnClose())
+                                     .build();
+    }
+
     public CompressedSequentialWriter(File file,
                                       File offsetsFile,
-                                      File digestFile,
+                                      @Nullable File digestFile,
                                       SequentialWriterOption option,
                                       CompressionParams parameters,
                                       MetadataCollector sstableMetadataCollector)
     {
         this(file, offsetsFile, digestFile, option, parameters, sstableMetadataCollector, null);
     }
 
-
     /**
-     * Create CompressedSequentialWriter without digest file.
+     * Create CompressedSequentialWriter with optional compression dictionary and channel options.
      *
      * @param file File to write
      * @param offsetsFile File to write compression metadata
-     * @param digestFile File to write digest
+     * @param digestFile File to write digest, or null if not needed
      * @param option Write option (buffer size and type will be set the same as compression params)
      * @param parameters Compression parameters
      * @param sstableMetadataCollector Metadata collector
      * @param compressionDictionaryManager manages compression dictionary; null if absent
+     * @param extraOpenOptions additional options to pass to FileChannel.open (e.g., ExtendedOpenOption.DIRECT)
      */
     public CompressedSequentialWriter(File file,
                                       File offsetsFile,
-                                      File digestFile,
+                                      @Nullable File digestFile,
                                       SequentialWriterOption option,
                                       CompressionParams parameters,
                                       MetadataCollector sstableMetadataCollector,
-                                      @Nullable CompressionDictionaryManager compressionDictionaryManager)
+                                      @Nullable CompressionDictionaryManager compressionDictionaryManager,
+                                      OpenOption... extraOpenOptions)
     {
-        super(file, SequentialWriterOption.newBuilder()
-                            .bufferSize(option.bufferSize())
-                            .bufferType(option.bufferType())
-                            .bufferSize(parameters.chunkLength())
-                            .bufferType(parameters.getSstableCompressor().preferredBufferType())
-                            .finishOnClose(option.finishOnClose())
-                            .build());
+        super(file, allocateBuffer(parameters), buildOption(option, parameters), true, extraOpenOptions);
         ICompressor compressor = parameters.getSstableCompressor();
         this.digestFile = Optional.ofNullable(digestFile);
 
@@ -142,7 +153,7 @@ public CompressedSequentialWriter(File file,
         metadataWriter = CompressionMetadata.Writer.open(parameters, offsetsFile, compressionDictionary);
 
         this.sstableMetadataCollector = sstableMetadataCollector;
-        crcMetadata = new ChecksumWriter(new DataOutputStream(Channels.newOutputStream(channel)));
+        crcMetadata = new ChecksumWriter(new DataOutputStream(Channels.newOutputStream(this.channel)));
     }
 
     @Override
@@ -178,7 +189,9 @@ public void flush()
     @Override
     protected void flushData()
     {
-        seekToChunkStart(); // why is this necessary? seems like it should always be at chunk start in normal operation
+        // resetAndTruncate leaves fchannel.position() past EOF after its verification reads + truncate;
+        // re-seek so the next chunk lands at chunkOffset. No-op under linear writes.
+        seekToChunkStart();
 
         try
         {
@@ -216,32 +229,36 @@ protected void flushData()
         }
         compressedSize += compressedLength;
 
+        // write an offset of the newly written chunk to the index file
+        metadataWriter.addOffset(chunkOffset);
+        chunkCount++;
+
+        // write out the compressed data and checksum
+        toWrite.flip();
+        writeChunk(toWrite);
+        lastFlushOffset = uncompressedSize;
+
+        if (toWrite == buffer)
+            buffer.position(uncompressedLength);
+
+        // next chunk should be written right after current + length of the checksum (int)
+        chunkOffset += compressedLength + 4;
+        if (runPostFlush != null)
+            runPostFlush.accept(getLastFlushOffset());
+    }
+
+    protected void writeChunk(ByteBuffer toWrite)
+    {
         try
         {
-            // write an offset of the newly written chunk to the index file
-            metadataWriter.addOffset(chunkOffset);
-            chunkCount++;
-
-            // write out the compressed data
-            toWrite.flip();
             channel.write(toWrite);
-
-            // write corresponding checksum
             toWrite.rewind();
             crcMetadata.appendDirect(toWrite, true);
-            lastFlushOffset = uncompressedSize;
         }
         catch (IOException e)
         {
             throw new FSWriteError(e, getPath());
         }
-        if (toWrite == buffer)
-            buffer.position(uncompressedLength);
-
-        // next chunk should be written right after current + length of the checksum (int)
-        chunkOffset += compressedLength + 4;
-        if (runPostFlush != null)
-            runPostFlush.accept(getLastFlushOffset());
     }
 
     public CompressionMetadata open(long overrideLength)
@@ -358,10 +375,16 @@ private void truncate(long toFileSize, long toBufferOffset)
         }
     }
 
+    protected void writeDigestFile()
+    {
+        digestFile.ifPresent(crcMetadata::writeFullChecksum);
+    }
+
     /**
      * Seek to the offset where next compressed data chunk should be stored.
+     * Subclasses may override if they manage their own channel.
      */
-    private void seekToChunkStart()
+    protected void seekToChunkStart()
     {
         if (getOnDiskFilePointer() != chunkOffset)
         {
@@ -429,7 +452,7 @@ protected Throwable doAbort(Throwable accumulate)
         protected void doPrepare()
         {
             syncInternal();
-            digestFile.ifPresent(crcMetadata::writeFullChecksum);
+            writeDigestFile();
             sstableMetadataCollector.addCompressionRatio(compressedSize, uncompressedSize);
             metadataWriter.finalizeLength(current(), chunkCount).prepareToCommit();
         }