ClickHouse · caetanosauer · Jun 23, 2026
diff --git a/.gitignore b/.gitignore
@@ -12,6 +12,8 @@ log.txt
 load_out.txt
 server.log
 server.pid
+server.endpoint
+server.endpoint.tmp
 arc_token.txt
 data-size.txt
 .doris_home

diff --git a/hyper-parquet/benchmark.sh b/hyper-parquet/benchmark.sh
@@ -2,10 +2,9 @@
 # Thin shim — actual flow is in lib/benchmark-common.sh.
 export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned"
 export BENCH_DURABLE=yes
-export BENCH_RESTARTABLE=no
-# Single-process engine: each query forks a fresh full-machine process with no
-# shared scheduler across connections, so the concurrent-QPS test only
-# oversubscribes RAM rather than measuring throughput. Skip it by default;
-# override BENCH_CONCURRENT_DURATION to re-enable. See issue #946.
-export BENCH_CONCURRENT_DURATION="${BENCH_CONCURRENT_DURATION:-0}"
+# RESTARTABLE=yes: ./start now launches a persistent hyperd whose lifecycle
+# matters, so the driver's cold cycle (stop -> wait_stopped -> drop_caches ->
+# start) gives an honest cold try 1 while tries 2..N stay hot on the warm
+# server. See issue #936.
+export BENCH_RESTARTABLE=yes
 exec ../lib/benchmark-common.sh
diff --git a/hyper-parquet/check b/hyper-parquet/check
@@ -1,12 +1,24 @@
 #!/bin/bash
+# Readiness probe: connect to the persistent Hyper server (via the descriptor
+# ./start published to server.endpoint) and run SELECT 1. Non-zero exit means
+# "not up yet" — the benchmark driver polls this in a loop after ./start and
+# uses its transition to failing as the "server is really stopped" signal in
+# the cold cycle.
 set -e
 
 # shellcheck disable=SC1091
 source myenv/bin/activate
 
+# No endpoint published => server isn't up.
+[ -s server.endpoint ] || exit 1
+
 python3 - <<'PY'
-from tableauhyperapi import HyperProcess, Telemetry, Connection
-with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
-    with Connection(hyper.endpoint) as connection:
-        connection.execute_list_query("SELECT 1")
+from tableauhyperapi import Connection, Endpoint
+
+with open("server.endpoint") as f:
+    descriptor = f.read().strip()
+
+endpoint = Endpoint(connection_descriptor=descriptor, user_agent="clickbench")
+with Connection(endpoint) as connection:
+    connection.execute_list_query("SELECT 1")
 PY
diff --git a/hyper-parquet/query b/hyper-parquet/query
@@ -1,9 +1,16 @@
 #!/bin/bash
-# Reads a SQL query from stdin, runs it via tableau hyperapi against the
-# partitioned parquet files (registered as a temp external table from
-# create.sql).
+# Reads a SQL query from stdin, runs it once against the partitioned parquet
+# files on the PERSISTENT Hyper server started by ./start (descriptor in
+# server.endpoint). The temp external table is (re)created from create.sql
+# before the timer starts, so its setup is not counted.
 # Stdout: query result.
 # Stderr: query runtime in fractional seconds on the last line.
+#
+# The benchmark driver calls this once per try (BENCH_TRIES). Because every
+# call connects to the SAME long-lived server (and the parquet files stay in
+# the OS page cache between tries), try 1 (right after the driver's
+# stop/drop_caches/start cold cycle) is cold and tries 2..N are genuinely hot.
+# See issue #936.
 set -e
 
 # shellcheck disable=SC1091
@@ -18,17 +25,20 @@ cat > "$query_file"
 python3 - "$query_file" <<'PY'
 import sys
 import timeit
-from tableauhyperapi import HyperProcess, Telemetry, Connection
+from tableauhyperapi import Connection, Endpoint
 
 with open(sys.argv[1]) as f:
     query = f.read()
 
-with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
-    with Connection(hyper.endpoint) as connection:
-        connection.execute_command(open("create.sql").read())
-        start = timeit.default_timer()
-        rows = connection.execute_list_query(query)
-        end = timeit.default_timer()
+with open("server.endpoint") as f:
+    descriptor = f.read().strip()
+
+endpoint = Endpoint(connection_descriptor=descriptor, user_agent="clickbench")
+with Connection(endpoint) as connection:
+    connection.execute_command(open("create.sql").read())
+    start = timeit.default_timer()
+    rows = connection.execute_list_query(query)
+    end = timeit.default_timer()
 
 for r in rows:
     print(r)

diff --git a/hyper-parquet/start b/hyper-parquet/start
@@ -1,2 +1,75 @@
 #!/bin/bash
-exit 0
+# Launch ONE long-lived Hyper server (hyperd) and publish its connection
+# descriptor to server.endpoint. Every ./query invocation then connects to
+# this single persistent process instead of spawning its own.
+#
+# This is the fix for issue #936: the per-query benchmark driver calls
+# ./query once per try (BENCH_TRIES) and relies on a long-lived server staying
+# warm so tries 2..N are genuinely hot. The previous ./query opened a brand-new
+# HyperProcess on every call, so each "hot" try paid the full cold cost (fresh
+# server, re-parsed parquet metadata, cold OS page cache). With a persistent
+# server those caches stay warm across tries, and the driver's cold cycle
+# (stop -> drop_caches -> start) still gives an honest cold try 1.
+#
+# Unlike hyper/, there is no hits.hyper to keep attached: the data is external
+# parquet read through the OS page cache (which the driver preserves between
+# tries), and the temp external table is connection-scoped so each ./query
+# recreates it untimed.
+set -e
+
+# shellcheck disable=SC1091
+source myenv/bin/activate
+
+# Already running? The pidfile + a live process is authoritative.
+if [ -f server.pid ] && kill -0 "$(cat server.pid 2>/dev/null)" 2>/dev/null; then
+    exit 0
+fi
+
+# Clean up stale artifacts from a previous (possibly crashed) server.
+rm -f server.pid server.endpoint
+
+# Background a supervisor that opens HyperProcess, writes the descriptor, then
+# blocks until ./stop signals it. nohup so it survives this script exiting;
+# $! is the supervisor PID we kill in ./stop.
+nohup python3 - >server.log 2>&1 <<'PY' &
+import os
+import signal
+import sys
+from tableauhyperapi import HyperProcess, Telemetry
+
+
+def _terminate(*_):
+    # Raise SystemExit so the `with HyperProcess` block exits cleanly and
+    # hyperd is shut down with us (it is terminated when its controlling
+    # process exits).
+    sys.exit(0)
+
+
+signal.signal(signal.SIGTERM, _terminate)
+signal.signal(signal.SIGINT, _terminate)
+
+with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
+    # Publish the descriptor atomically so ./check and ./query never read a
+    # half-written file.
+    with open("server.endpoint.tmp", "w") as f:
+        f.write(hyper.endpoint.connection_descriptor)
+    os.replace("server.endpoint.tmp", "server.endpoint")
+    # Block until a signal arrives. pause() returns once a handler has run, so
+    # loop: only _terminate's sys.exit (SIGTERM/SIGINT) leaves the with block.
+    while True:
+        signal.pause()
+PY
+echo $! > server.pid
+
+# Give the supervisor a moment to publish the endpoint. The benchmark driver
+# also runs ./check in a loop afterwards, so this is just a fast-path / clean
+# error rather than the authoritative readiness gate.
+for _ in $(seq 1 60); do
+    [ -s server.endpoint ] && exit 0
+    # Supervisor ($!) already gone, e.g. crashed at startup? Fail now, not after 60s.
+    kill -0 "$!" 2>/dev/null || break
+    sleep 1
+done
+
+echo "hyper-parquet: server did not publish server.endpoint (supervisor exited or 60s timeout; see server.log)" >&2
+exit 1
diff --git a/hyper-parquet/stop b/hyper-parquet/stop
@@ -1,2 +1,24 @@
 #!/bin/bash
+# Stop the persistent Hyper server started by ./start. SIGTERM the supervisor
+# (see ./start); its handler exits the `with HyperProcess` block, which shuts
+# down hyperd. Idempotent: a missing/stale pidfile is not an error.
+set -e
+
+if [ -f server.pid ]; then
+    pid="$(cat server.pid 2>/dev/null || true)"
+    if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+        kill "$pid" 2>/dev/null || true
+        # Wait for the supervisor (and thus hyperd) to really exit, so the
+        # driver's drop_caches isn't defeated by pages pinned by a live mmap.
+        for _ in $(seq 1 60); do
+            kill -0 "$pid" 2>/dev/null || break
+            sleep 1
+        done
+        # Force it only if it is STILL alive after 60s: an unconditional kill -9
+        # could hit an unrelated process that reused the PID after a clean exit.
+        if kill -0 "$pid" 2>/dev/null; then kill -9 "$pid" 2>/dev/null || true; fi
+    fi
+fi
+
+rm -f server.pid server.endpoint
 exit 0
diff --git a/hyper/benchmark.sh b/hyper/benchmark.sh
@@ -2,10 +2,9 @@
 # Thin shim — actual flow is in lib/benchmark-common.sh.
 export BENCH_DOWNLOAD_SCRIPT="download-hits-csv"
 export BENCH_DURABLE=yes
-export BENCH_RESTARTABLE=no
-# Single-process engine: each query forks a fresh full-machine process with no
-# shared scheduler across connections, so the concurrent-QPS test only
-# oversubscribes RAM rather than measuring throughput. Skip it by default;
-# override BENCH_CONCURRENT_DURATION to re-enable. See issue #946.
-export BENCH_CONCURRENT_DURATION="${BENCH_CONCURRENT_DURATION:-0}"
+# RESTARTABLE=yes: ./start now launches a persistent hyperd whose lifecycle
+# matters, so the driver's cold cycle (stop -> wait_stopped -> drop_caches ->
+# start) gives an honest cold try 1 while tries 2..N stay hot on the warm
+# server. See issue #936.
+export BENCH_RESTARTABLE=yes
 exec ../lib/benchmark-common.sh
diff --git a/hyper/check b/hyper/check
@@ -1,12 +1,24 @@
 #!/bin/bash
+# Readiness probe: connect to the persistent Hyper server (via the descriptor
+# ./start published to server.endpoint) and run SELECT 1. Non-zero exit means
+# "not up yet" — the benchmark driver polls this in a loop after ./start and
+# uses its transition to failing as the "server is really stopped" signal in
+# the cold cycle.
 set -e
 
 # shellcheck disable=SC1091
 source myenv/bin/activate
 
+# No endpoint published => server isn't up.
+[ -s server.endpoint ] || exit 1
+
 python3 - <<'PY'
-from tableauhyperapi import HyperProcess, Telemetry, Connection
-with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
-    with Connection(hyper.endpoint) as connection:
-        connection.execute_list_query("SELECT 1")
+from tableauhyperapi import Connection, Endpoint
+
+with open("server.endpoint") as f:
+    descriptor = f.read().strip()
+
+endpoint = Endpoint(connection_descriptor=descriptor, user_agent="clickbench")
+with Connection(endpoint) as connection:
+    connection.execute_list_query("SELECT 1")
 PY
diff --git a/hyper/load b/hyper/load
@@ -1,4 +1,8 @@
 #!/bin/bash
+# Create hits.hyper and COPY hits.csv into it, using the PERSISTENT Hyper
+# server started by ./start (descriptor in server.endpoint). Loading through
+# the already-running server avoids briefly running two hyperd instances
+# (each of which would try to claim up to 80% of RAM) during the heavy COPY.
 set -e
 
 # shellcheck disable=SC1091
@@ -8,12 +12,15 @@ source myenv/bin/activate
 rm -f hits.hyper
 
 python3 - <<'PY'
-from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode
+from tableauhyperapi import Connection, Endpoint, CreateMode
 
-with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
-    with Connection(hyper.endpoint, 'hits.hyper', CreateMode.CREATE_AND_REPLACE) as connection:
-        connection.execute_command(open("create.sql").read())
-        connection.execute_command("copy hits from 'hits.csv' with (format csv)")
+with open("server.endpoint") as f:
+    descriptor = f.read().strip()
+
+endpoint = Endpoint(connection_descriptor=descriptor, user_agent="clickbench")
+with Connection(endpoint, 'hits.hyper', CreateMode.CREATE_AND_REPLACE) as connection:
+    connection.execute_command(open("create.sql").read())
+    connection.execute_command("copy hits from 'hits.csv' with (format csv)")
 PY
 
 rm -f hits.csv

diff --git a/hyper/query b/hyper/query
@@ -1,8 +1,13 @@
 #!/bin/bash
-# Reads a SQL query from stdin, runs it via tableau hyperapi against
-# hits.hyper.
+# Reads a SQL query from stdin, runs it once against hits.hyper on the
+# PERSISTENT Hyper server started by ./start (descriptor in server.endpoint).
 # Stdout: query result.
 # Stderr: query runtime in fractional seconds on the last line.
+#
+# The benchmark driver calls this once per try (BENCH_TRIES). Because every
+# call connects to the SAME long-lived server, the buffer pool stays warm
+# across tries: try 1 (right after the driver's stop/drop_caches/start cold
+# cycle) is cold, tries 2..N are genuinely hot. See issue #936.
 set -e
 
 # shellcheck disable=SC1091
@@ -17,16 +22,19 @@ cat > "$query_file"
 python3 - "$query_file" <<'PY'
 import sys
 import timeit
-from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode
+from tableauhyperapi import Connection, Endpoint
 
 with open(sys.argv[1]) as f:
     query = f.read()
 
-with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
-    with Connection(hyper.endpoint, 'hits.hyper', CreateMode.NONE) as connection:
-        start = timeit.default_timer()
-        rows = connection.execute_list_query(query)
-        end = timeit.default_timer()
+with open("server.endpoint") as f:
+    descriptor = f.read().strip()
+
+endpoint = Endpoint(connection_descriptor=descriptor, user_agent="clickbench")
+with Connection(endpoint, 'hits.hyper') as connection:
+    start = timeit.default_timer()
+    rows = connection.execute_list_query(query)
+    end = timeit.default_timer()
 
 for r in rows:
     print(r)