ClickCannon/example.yaml at main · ClickHouse/ClickCannon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
##############################################################################
#                         ClickCannon example config                         #
#     Less a minimal example, more a reference for every current option.     #
##############################################################################

app:
  # Human-readable label used to tell apart multiple runs launched from this config
  name: 'insert traces 2 GiB/s'
  # Logging verbosity and destinations
  log_level: debug
  log_to_console: true
  log_to_file: false
  # Which kind of data the test reads/inserts. One of: logs, traces
  data_type: traces
  # Seed for the test; fed to random number generation so that tests are reproducible.
  seed: 'some string'

pprof:
  # Listen address for the pprof HTTP endpoint.
  # An empty value (or omitting this whole section) disables pprof.
  # Suggested setup: bind to localhost and reach it through an SSH tunnel:
  #   ssh -L 6060:localhost:6060 user@server
  # With the tunnel up, profile remotely:
  #   go tool pprof http://localhost:6060/debug/pprof/heap
  #   go tool pprof http://localhost:6060/debug/pprof/profile    # 30s CPU profile
  #   go tool pprof http://localhost:6060/debug/pprof/goroutine
  # Alternatively, download a snapshot and inspect it offline:
  #   curl -s http://localhost:6060/debug/pprof/heap > heap.prof && go tool pprof heap.prof
  address: ''

# Generate mode: synthetic data generation (mutually exclusive with disk mode).
# Produces OTel data directly from a code-defined generator profile.
generate:
  enabled: false
  # Number of generator threads
  threads: 2
  # Number of rows generated per block before the block is pushed to the insert queue
  rows_per_block: 10000
  # Rate limit in rows per second, shared across all threads. 0 = unlimited.
  rows_per_second: 10000
  # Block reuse settings (same concept as disk.reuse_blocks; see the disk section
  # for the detailed explanation of reuse and retirement)
  reuse_blocks: true
  block_retirement_uses: 50

  # Name of a code-defined profile registered in internal/generate.
  # Built-in: otel_demo. Defaults to otel_demo if omitted.
  # To add your own, drop a profile_<name>.go in internal/generate that calls
  # generate.RegisterProfile("<name>", ...) from init() and reference it here.
  # profile: otel_demo

  # Trace-specific settings (only used when app.data_type is traces)
  traces:
    # Number of spans per trace (uniform random between min and max)
    spans_per_trace_min: 3
    spans_per_trace_max: 12
    # Maximum depth of the trace tree
    max_depth: 5
    # Span duration range, in microseconds
    duration_min_us: 1000
    duration_max_us: 5000000

# Disk mode: replay pre-exported .native/.native.zst files (mutually exclusive with generate)
disk:
  enabled: true
  threads: 4
  # Paths to the logs/traces directories. Files must use the .native or .native.zst extension.
  logs_path: log_data
  traces_path: trace_data
  # Set to true if the log files on disk were exported with TimestampTime present.
  has_timestamp_time: false
  # Blocks can be reused to reduce garbage in the program and improve throughput stability, but
  # ch-go has a bug where some columns aren't properly reset. I've hacked around this for now and so
  # you shouldn't notice it, but maybe I missed a detail when querying the data back.
  # Set this to false if you notice the inserted data has maps that have mismatched key/value pairs, or empty columns.
  reuse_blocks: true
  # After how many uses a reused block is retired and replaced with a fresh allocation.
  # Column slice backing arrays grow over time as blocks are filled and reset repeatedly.
  # Retiring blocks periodically bounds this memory growth without sacrificing the
  # throughput stability benefits of block reuse.
  # Set to 0 to disable retirement (blocks live for the program's lifetime).
  # Only applies when reuse_blocks is true.
  block_retirement_uses: 100
  # Should the file queue be looped? (Consider the shift_timestamp setting. May cause uneven distribution of data.)
  # Also consider the thread count.
  # If there are more threads than files, the looped files will immediately go in the queue.
  loop: false
  # The speed limit for uncompressed data across all threads, in MiB per second
  mb_per_second_limit: 2048
  # Shifts the timestamp of the data when reading from disk.
  # Options: <empty>, none, date, all, now.
  # Empty/"none" will do nothing and replay the data as it is written on disk. Event times will be unchanged.
  # "date" will replace only the date component with the current date.
  #        The event timestamp will keep its time component, but with the current date.
  #        This may cause issues if your data set goes across multiple days.
  # "all" will replay the data with its original timings by measuring the difference between the
  #       event time and the first timestamp in the first file loaded. This difference is then added to the
  #       program start time.
  #       In effect, the original data is replayed with the first event time moved to the program start time.
  #       If looping is enabled, the program will track the range of time that the dataset covers and account
  #       for its duration between loops.
  #       There may be some overlap where the first files end and the looped file queue starts.
  # "now" will simply set the event time to the current timestamp at the time the row is processed.
  #       This may lead to unrealistic time distributions where an entire block of rows share the same timestamp.
  shift_timestamp: all

insert:
  enabled: true
  threads: 4
  # How many rows per INSERT command. Blocks will be streamed until this limit, then a new INSERT will start.
  # After this limit is reached, the current block will finish sending and then end the INSERT, therefore it's not
  # an exact limit.
  batch_size: 100000
  # How many batches a worker sends before retiring and reconnecting. The ch-go encoder
  # accumulates buffer allocations over time; retiring workers periodically bounds this growth.
  # Workers are staggered across the window by ID so they don't all restart simultaneously
  # (e.g. with 4 workers and retirement=100, they retire at absolute batch 100, 125, 150, 175 respectively).
  # Set to 0 to disable. This value should be based on your configured throughput/batches estimate.
  worker_retirement_batches: 0
  # When enabled, connections are balanced evenly across nodes in the cluster.
  # A brute force method is used, which may spam connections until it gets the correct IP.
  balance_nodes: false
  clickhouse:
    address: localhost:9000
    secure: false
    user: default
    # Explicit empty string: a bare `password:` parses as null, not as "".
    password: ""
    # One of: none, lz4, zstd, lz4hc
    # This will affect the Insert Bytes metric.
    # Explicit empty string instead of a bare key (which parses as null).
    # NOTE(review): an empty value is presumably treated like "none" — confirm.
    compression: ""
    # Database and tables to insert into
    database: otel
    logs_table: otel_logs
    traces_table: otel_traces


# Settings for the program's own read/insert metrics
metrics:
  enabled: true
  # ClickHouse DSN URL, in the same format the OTel exporter uses.
  clickhouse_dsn: tcp://localhost:9000

  # Database and tables in which performance metrics are captured.
  database: clickcannon
  run_table: runs
  metrics_table: perf
  # Whether the metrics database/tables get created on startup
  create_schema: true
  # Optional key-value attributes, attached to the run record and to every metric point.
  # Handy for tagging runs by environment, team, hardware, etc.
  attributes:
    env: 'dev'
    foo: 'bar'

# Settings related to user/query benchmarking
user:
  enabled: true
  # Optional. How long the user simulation runs.
  duration: 1h
  # Number of concurrent simulated users
  threads: 10
  # How long it takes to start all users
  ramp_duration: 30s
  # ClickHouse DSN URL, same format as OTel exporter.
  clickhouse_dsn: tcp://localhost:9000
  # How many connections each thread will make
  connections_per_thread: 2
  # The database parameter, available in the SQL as {database:Identifier}
  database: otel
  # The table parameter, available in the SQL as {table:Identifier}
  table: otel_traces

  # When does your dataset start? (only required for time_anchor=dataset_random)
  # NOTE(review): the key name and value suggest Unix epoch seconds — confirm.
  dataset_unix_start: 1758585600
  # When does your dataset end? (only required for time_anchor=dataset_end|dataset_random)
  dataset_unix_end: 1758631499

  # An array of user workflows. For now, we only use 1 at a time.
  workflows:
    # Workflow type: must be "queries". HAR file replay will be re-added in the future.
    - type: queries
      # A name for this workflow, stored in performance metrics metadata
      name: "Example workload"
      # Should queries be run randomly, or sequentially?
      random: false
      # Time to wait between queries. Sampled uniformly random from [min, max]
      think_time:
        min: 1s
        max: 5s

      # The time anchor controls what "now" means when computing time ranges.
      # All time ranges operate as a "lookback", so it's [anchor - range, anchor]
      #   now:            wall clock — use for live/realtime datasets
      #   dataset_end:    fixed anchor at user.dataset_unix_end — use for historical datasets
      #   dataset_random: random point within [user.dataset_unix_start, user.dataset_unix_end]
      #                   per query, maximizes cache thrashing on fixed datasets.
      time_anchor: now

      # Applied to all queries unless overridden. Set type=none on a query to opt out.
      default_time_range:
        type: uniform # one of: none, fixed, uniform, exponential, log_normal
        # round is optional, rounds the sampled start/end to the nearest interval.
        # Make sure this fits your range. This is available for all time_range types.
        round: 1m
        # These options change depending on type, see example queries below.
        min: 15m
        max: 4h

      # ClickHouse settings applied to all queries (including preflight) in this workflow.
      # These can be overridden per-query
      default_settings:
        max_execution_time: "60"

      # Controls how often the default_time_range is resampled.
      #   per_query: new time range on every query execution
      #   per_loop:  sample once at the start of each pass through the query list (when random=false)
      time_range_cadence: per_query

      # Optional: constant variables available to all queries and preflight queries in this workflow.
      # Available in SQL as {VarName:String}. Query-level vars override these.
      vars:
        Example: "value"
        Env: "prod"

      # Controls when workflow-level preflight_queries are (re-)executed.
      #   once:      run once at startup, result cached for the entire test (default)
      #   per_loop:  re-run at the start of each pass through the query list (random=false only)
      #   per_query: re-run before every query execution
      preflight_cadence: once

      # Optional: preflight queries that run before queries in this workflow.
      # Results are bound as variables and available to subsequent preflight queries and the main query.
      # Query-level preflight_queries run after these and can override the same binds.
      # Each preflight query must return exactly 1 row; all columns must be String type.
      # The same time range context ({time_start}, {time_end}) is available.
      preflight_queries:
        # max(Timestamp) is not a String column, so it is wrapped in toString()
        # to satisfy the "all columns must be String type" rule above.
        - sql: |
            SELECT toString(max(Timestamp)) AS DatasetMaxTS
            FROM {database:Identifier}.{table:Identifier}
          binds: [DatasetMaxTS]
          # Optional settings override
          settings:
            max_execution_time: "30"

      queries:
        - name: "Basic count — inherits default_time_range"
          sql: |
            SELECT count()
            FROM {database:Identifier}.{table:Identifier}
            WHERE Timestamp >= {time_start:DateTime64(3)}
              AND Timestamp <= {time_end:DateTime64(3)}
          # Optional per-query settings override
          settings:
            max_execution_time: "30"
          # Optional: perf targets, stored as metadata on each latency metric.
          perf:
            # You can define all or none of these, but it is limited to p50, p90, p95, and p99.
            # Targets are kept non-decreasing (p50 <= p90 <= p95 <= p99): a higher
            # percentile can never have a lower latency than a lower one.
            p50: 500ms
            p90: 2s
            p95: 3s
            p99: 5s

        - name: 'Fixed lookback override'
          sql: |
            SELECT count()
            FROM {database:Identifier}.{table:Identifier}
            WHERE Timestamp >= {time_start:DateTime64(3)}
              AND Timestamp <= {time_end:DateTime64(3)}
          time_range:
            type: fixed
            # The window is always [anchor - 1h, anchor].
            lookback: 1h

        - name: 'Exponential lookback — investigation-heavy, favors short ranges'
          sql: |
            SELECT count()
            FROM {database:Identifier}.{table:Identifier}
            WHERE Timestamp >= {time_start:DateTime64(3)}
              AND Timestamp <= {time_end:DateTime64(3)}
          time_range:
            type: exponential
            # Optional rounding value, also available in default_time_range.
            round: 5m
            mean: 30m
            # min and max are optional clamps.
            min: 5m
            max: 24h

        - name: 'Log-normal lookback — mixed workload, short bias with long tail'
          sql: |
            SELECT count()
            FROM {database:Identifier}.{table:Identifier}
            WHERE Timestamp >= {time_start:DateTime64(3)}
              AND Timestamp <= {time_end:DateTime64(3)}
          time_range:
            type: log_normal
            mean: 2h
            min: 5m
            max: 24h
            # Spread and tail weight are governed by sigma. Default: 0.5.
            sigma: 0.5

        - name: 'Preflight queries — bind values before executing the main query'
          # preflight_queries execute ahead of the main query, and their latency is not counted.
          # Every one must return exactly 1 row, with all columns of String type.
          # They see the same time range context ({time_start}, {time_end}).
          # Execution is in order, so a later query can reference binds from an earlier one.
          # Values may need a cast to fit the type the main query expects.
          # Optional query-level vars set constant binds before preflight_queries run.
          vars:
            FallbackTraceId: '00000000000000000000000000000000'
          preflight_queries:
            - sql: |
                SELECT TraceId, SpanName
                FROM {database:Identifier}.{table:Identifier}
                WHERE Timestamp >= {time_start:DateTime64(3)}
                  AND Timestamp <= {time_end:DateTime64(3)}
                ORDER BY Timestamp ASC
                LIMIT 1
              # Optional settings override
              settings:
                max_execution_time: '60'
              # Exposed as {TraceId:String} and {SpanName:String} in subsequent
              # preflight queries and in the main query.
              binds:
                - TraceId
                - SpanName
            - sql: |
                SELECT ServiceName
                FROM {database:Identifier}.{table:Identifier}
                WHERE TraceId = {TraceId:String}
                LIMIT 1
              # Exposed as {ServiceName:String} in the main query.
              binds:
                - ServiceName
          sql: |
            SELECT *
            FROM {database:Identifier}.{table:Identifier}
            WHERE TraceId = {TraceId:String}
          time_range:
            type: uniform
            min: 15m
            max: 2h

        - name: 'No time range — explicit opt-out of default_time_range'
          sql: |
            SELECT count() FROM {database:Identifier}.{table:Identifier}
          # type: none opts this query out of the workflow's default_time_range.
          time_range:
            type: none