= Overview

This document describes the configuration options available for the bulk reader and bulk writer components.

== Cassandra Sidecar Configuration

The Analytics library uses Sidecar to interact with the Cassandra cluster. The bulk reader and bulk writer
components share common Sidecar configuration properties.

[cols="1,1,2"]
|===
|Property name|Default|Description

|_sidecar_contact_points_
|
|Comma-separated list of Cassandra Sidecar contact points. IP addresses and fully qualified domain names are
supported, with an optional port number (e.g. `localhost1,localhost2`, `127.0.0.1,127.0.0.2`, `127.0.0.1:9043,127.0.0.2:9043`)

|_sidecar_port_
|`9043`
|Default port on which Cassandra Sidecar listens

|_keystore_path_
|
|Path to the keystore used to establish a TLS connection with Cassandra Sidecar

|_keystore_base64_encoded_
|
|Base64-encoded keystore used to establish a TLS connection with Cassandra Sidecar

|_keystore_password_
|
|Keystore password

|_keystore_type_
|`PKCS12`
|Keystore type, `PKCS12` or `JKS`

|_truststore_path_
|
|Path to the truststore used to establish a TLS connection with Cassandra Sidecar

|_truststore_base64_encoded_
|
|Base64-encoded truststore used to establish a TLS connection with Cassandra Sidecar

|_truststore_password_
|
|Truststore password

|_truststore_type_
|`PKCS12`
|Truststore type, `PKCS12` or `JKS`

|_cassandra_role_
|
|Specific role that Sidecar uses to authorize the request. For further details, consult the Sidecar documentation
for the `cassandra-auth-role` HTTP header

|===
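
The accepted _sidecar_contact_points_ formats can be illustrated with a small parser that splits the
comma-separated list and separates the optional port from each host. This is an illustrative sketch, not part of
the library:

[source,python]
----
def parse_contact_points(value):
    """Split a comma-separated contact point list into (host, port) pairs.

    The port is optional; None is returned when it is absent, so the caller
    can fall back to the _sidecar_port_ setting (default 9043).
    """
    points = []
    for entry in value.split(","):
        entry = entry.strip()
        host, sep, port = entry.rpartition(":")
        if sep:  # explicit port, e.g. "127.0.0.1:9043"
            points.append((host, int(port)))
        else:    # bare host or IP, e.g. "localhost1"
            points.append((entry, None))
    return points
----

For example, `parse_contact_points("127.0.0.1:9043,localhost2")` yields `[("127.0.0.1", 9043), ("localhost2", None)]`.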

== Bulk Reader

This section describes configuration properties specific to the bulk reader.

=== Cassandra Sidecar Configuration

[cols="1,1,2"]
|===
|Property name|Default|Description

|_defaultMillisToSleep_
|`500`
|Number of milliseconds to wait between retry attempts

|_maxMillisToSleep_
|`60000`
|Maximum number of milliseconds to sleep between retries

|_maxPoolSize_
|`64`
|Size of the Vert.x worker thread pool

|_timeoutSeconds_
|`600`
|Request timeout, expressed in seconds

|===
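
_defaultMillisToSleep_ and _maxMillisToSleep_ bound the sleep between retry attempts. The default/maximum pair
suggests a capped backoff; the sketch below shows one plausible doubling policy. The doubling itself is an
assumption for illustration, not the library's documented behaviour:

[source,python]
----
def sleep_millis(attempt, default_millis=500, max_millis=60000):
    """Capped backoff: double the default sleep per attempt (assumed policy).

    default_millis mirrors _defaultMillisToSleep_ and max_millis mirrors
    _maxMillisToSleep_; the result never exceeds the configured maximum.
    """
    return min(default_millis * (2 ** attempt), max_millis)
----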

=== Spark Reader Configuration

[cols="1,1,2"]
|===
|Property name|Default|Description

|_keyspace_
|
|Keyspace of the table to read

|_table_
|
|Table to be read

|_dc_
|
|Data center used when a `LOCAL_*` consistency level is specified

|_consistencyLevel_
|`LOCAL_QUORUM`
|Read consistency level

|_snapshotName_
|`sbr_\{uuid\}`
|Name of the snapshot to use (for data consistency). By default, a unique name is generated

|_createSnapshot_
|`true`
|Indicates whether a new snapshot should be created prior to performing the read operation

|_clearSnapshotStrategy_
|`OnCompletionOrTTL 2d`
|Strategy for removing the snapshot once the read operation completes. This option always applies when the
_createSnapshot_ flag is set to `true`. The value of _clearSnapshotStrategy_ must follow the format
`[strategy] [snapshotTTL]`. Supported strategies: `NoOp`, `OnCompletion`, `OnCompletionOrTTL`, `TTL`. Example
configurations: `OnCompletionOrTTL 2d`, `TTL 2d`, `NoOp`, `OnCompletion`. The TTL value has to match the
pattern `\d+(d\|h\|m\|s)`

|_bigNumberConfig_
|
a|Defines the output scale and precision of `decimal` and `varint` columns. The parameter value is a JSON string
with the following structure:

[source,json]
----
{
  "columnName1" : {"bigDecimalPrecision": 10, "bigDecimalScale": 5},
  "columnName2" : {"bigIntegerPrecision": 10, "bigIntegerScale": 5}
}
----

|_lastModifiedColumnName_
|
|Name of the field appended to the Spark RDD that holds the last modification timestamp of each row

|===
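
The _clearSnapshotStrategy_ format (`[strategy] [snapshotTTL]`, with the TTL matching `\d+(d|h|m|s)`) can be
validated with a short regular expression. A minimal sketch, separate from the library:

[source,python]
----
import re

# Accepts "NoOp", "OnCompletion", "TTL 2d", "OnCompletionOrTTL 2d", ...
STRATEGY_RE = re.compile(
    r"^(NoOp|OnCompletion|OnCompletionOrTTL|TTL)(?: (\d+[dhms]))?$"
)

def parse_clear_snapshot_strategy(value):
    """Return (strategy, ttl) for a valid value; ttl is None when absent."""
    match = STRATEGY_RE.match(value)
    if match is None:
        raise ValueError(f"invalid clearSnapshotStrategy: {value!r}")
    return match.group(1), match.group(2)
----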

=== Other Properties

[cols="1,1,2"]
|===
|Property name|Default|Description

|_defaultParallelism_
|`1`
|Value of the Spark property `spark.default.parallelism`

|_numCores_
|`1`
|Total number of cores used by all Spark executors

|_maxBufferSizeBytes_
|`6291456`
a|Maximum number of bytes per sstable file that may be downloaded and buffered in memory. This parameter is a
global default and can be overridden per sstable file type. Effective defaults are:

- `Data.db`: 6291456
- `Index.db`: 131072
- `Summary.db`: 262144
- `Statistics.db`: 131072
- `CompressionInfo.db`: 131072
- `.log` (commit log): 65536
- `Partitions.db`: 131072
- `Rows.db`: 131072

To override the size for `Data.db`, use the property `_maxBufferSizeBytes_Data.db_`.

|_chunkBufferSizeBytes_
|`4194304`
a|Default chunk size (in bytes) requested when fetching the next portion of an sstable file. This parameter is a
global default and can be overridden per sstable file type. Effective defaults are:

- `Data.db`: 4194304
- `Index.db`: 32768
- `Summary.db`: 131072
- `Statistics.db`: 65536
- `CompressionInfo.db`: 65536
- `.log` (commit log): 65536
- `Partitions.db`: 4096
- `Rows.db`: 4096

To override the size for `Data.db`, use the property `_chunkBufferSizeBytes_Data.db_`.

|_sizing_
|`default`
a|Determines how the number of CPU cores is selected during the read operation. Supported options:

* `default`: static number of cores defined by the _numCores_ parameter
* `dynamic`: calculates the number of cores dynamically based on the table size. Improves cost efficiency when
processing small tables (a few GBs). Consult the JavaDoc of `org.apache.cassandra.spark.data.DynamicSizing` for
implementation details. Relevant configuration properties:
 ** _maxPartitionSize_: maximum Spark partition size (in GiB)

|_quote_identifiers_
|`false`
|When `true`, keyspace, table and column names are quoted

|_sstable_start_timestamp_micros_ and _sstable_end_timestamp_micros_
|
|Define an inclusive time-range filter for sstable selection. Both timestamps are expressed in microseconds

|===
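
The per-file-type override scheme for _maxBufferSizeBytes_ (and, analogously, _chunkBufferSizeBytes_) amounts to
a two-level lookup: a file-type-specific property wins over the per-type effective default, which in turn wins
over the global default. A sketch of that resolution, using the defaults listed above:

[source,python]
----
# Effective per-type defaults for maxBufferSizeBytes, from the table above.
MAX_BUFFER_DEFAULTS = {
    "Data.db": 6291456,
    "Index.db": 131072,
    "Summary.db": 262144,
    "Statistics.db": 131072,
    "CompressionInfo.db": 131072,
}

def effective_buffer_size(options, file_type, global_default=6291456):
    """Resolve the buffer size for one sstable file type.

    options holds user-supplied properties such as
    "maxBufferSizeBytes_Data.db"; when no override is present, fall back
    to the per-type default, then to the global default.
    """
    override = options.get(f"maxBufferSizeBytes_{file_type}")
    if override is not None:
        return int(override)
    return MAX_BUFFER_DEFAULTS.get(file_type, global_default)
----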

== Bulk Writer

This section describes configuration properties specific to the bulk writer.

=== Spark Writer Configuration

[cols="1,1,2"]
|===
|Property name|Default|Description

|_keyspace_
|
|Keyspace of the table to write

|_table_
|
|Table to which rows are written, or from which rows are removed, depending on _write_mode_

|_local_dc_
|
|Data center used when a `LOCAL_*` consistency level is specified

|_bulk_writer_cl_
|`EACH_QUORUM`
|Write consistency level

|_write_mode_
|`INSERT`
|Determines the write mode: `INSERT` or `DELETE_PARTITION`

|_ttl_
|
|Time-to-live value applied to created records

|_timestamp_
|`NOW`
|Mutation timestamp assigned to generated rows, expressed in microseconds

|_skip_extended_verify_
|`false`
|Every imported sstable is verified for corruption during the import process. Setting this property to `true`
skips the extended check of all values in the new sstables

|_quote_identifiers_
|`false`
|Specifies whether identifiers (i.e. keyspace, table and column names) should be quoted to support mixed-case
and reserved-keyword names for these fields

|_data_transport_
|`DIRECT`
a|Specifies the data transport mode. Supported implementations:

* `DIRECT`: upload generated sstables directly to the Cassandra cluster via Sidecar
* `S3_COMPAT`: upload generated sstables to remote S3-compatible storage

|===
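
A bulk write configuration is ultimately a map of the properties above. The helper below assembles such a map
with the documented defaults; how the map is then handed to the Spark writer is library-specific, so treat this
purely as an illustration of the property names and defaults:

[source,python]
----
def bulk_writer_options(keyspace, table, **overrides):
    """Assemble writer options using the documented defaults.

    Unset optional properties (local_dc, ttl) are omitted from the result;
    unknown override names are rejected to catch typos early.
    """
    options = {
        "keyspace": keyspace,
        "table": table,
        "local_dc": None,
        "bulk_writer_cl": "EACH_QUORUM",
        "write_mode": "INSERT",
        "ttl": None,
        "timestamp": "NOW",
        "skip_extended_verify": "false",
        "quote_identifiers": "false",
        "data_transport": "DIRECT",
    }
    unknown = set(overrides) - set(options)
    if unknown:
        raise ValueError(f"unknown options: {sorted(unknown)}")
    options.update(overrides)
    return {k: v for k, v in options.items() if v is not None}
----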

=== S3 Upload Properties

[cols="1,1,2"]
|===
|Property name|Default|Description

|===

=== Other Properties

[cols="1,1,2"]
|===
|Property name|Default|Description

|_number_splits_
|`-1`
|User-defined number of token range splits. By default, the library dynamically calculates the number of splits
based on the Spark properties `spark.default.parallelism`, `spark.executor.cores` and `spark.executor.instances`

|_sstable_data_size_in_mib_
|`160`
|Maximum sstable size (in MiB)

|_digest_
|`XXHash32`
|Digest algorithm used to compute checksums for validation when uploading sstables. Supported values:
`XXHash32`, `MD5`

|_job_timeout_seconds_
|`-1`
a|Specifies a timeout in seconds for bulk write jobs. Disabled by default. When configured, a job exceeding
the timeout is:

* successful when the desired consistency level is achieved
* failed otherwise

|_job_id_
|
|User-defined identifier for the bulk write job

|===
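
_number_splits_ uses `-1` as a sentinel for "calculate dynamically". The sketch below shows the sentinel
handling; the derivation from the Spark properties is one plausible heuristic offered as an assumption, not the
library's actual formula:

[source,python]
----
def effective_splits(number_splits, default_parallelism,
                     executor_cores, executor_instances):
    """Resolve the number of token range splits.

    A positive number_splits wins; otherwise derive a value from the Spark
    properties named in the table above. The max() heuristic here is an
    illustrative assumption.
    """
    if number_splits > 0:
        return number_splits
    return max(default_parallelism, executor_cores * executor_instances)
----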