diff --git a/automation/src/main/java/listeners/RetryAnalyzer.java b/automation/src/main/java/listeners/RetryAnalyzer.java
new file mode 100644
index 00000000..74c0c75d
--- /dev/null
+++ b/automation/src/main/java/listeners/RetryAnalyzer.java
@@ -0,0 +1,47 @@
+package listeners;
+
+import org.testng.IRetryAnalyzer;
+import org.testng.ITestResult;
+
+import java.util.Random;
+
+/**
+ * Retries failed tests up to {@value MAX_RETRIES} times with exponential
+ * backoff to handle transient CI failures (e.g. HDFS multi-block write
+ * timeouts on resource-constrained GitHub Actions runners).
+ *
+ * Delay schedule: 3-8s, 6-16s, 12-32s (capped at 60s).
+ */
+public class RetryAnalyzer implements IRetryAnalyzer {
+
+ private static final int MAX_RETRIES = 3;
+ private static final int BASE_MIN_MS = 3000;
+ private static final int BASE_MAX_MS = 8000;
+ private static final int MAX_DELAY_MS = 60000;
+
+ private int retryCount = 0;
+ private final Random random = new Random();
+
+ @Override
+ public boolean retry(ITestResult result) {
+ if (retryCount < MAX_RETRIES) {
+ retryCount++;
+ int multiplier = 1 << (retryCount - 1); // 1, 2, 4
+ int minDelay = Math.min(BASE_MIN_MS * multiplier, MAX_DELAY_MS);
+ int maxDelay = Math.min(BASE_MAX_MS * multiplier, MAX_DELAY_MS);
+ int delay = minDelay + random.nextInt(maxDelay - minDelay + 1);
+ System.out.println("[RetryAnalyzer] Retrying failed test: "
+ + result.getTestClass().getName() + "."
+ + result.getMethod().getMethodName()
+ + " after " + delay + "ms delay"
+ + " (attempt " + (retryCount + 1) + "/" + (MAX_RETRIES + 1) + ")");
+ try {
+ Thread.sleep(delay);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ }
+ return true;
+ }
+ return false;
+ }
+}
diff --git a/automation/src/main/java/listeners/RetryListener.java b/automation/src/main/java/listeners/RetryListener.java
new file mode 100644
index 00000000..8b2ca0b9
--- /dev/null
+++ b/automation/src/main/java/listeners/RetryListener.java
@@ -0,0 +1,26 @@
+package listeners;
+
+import org.testng.IAnnotationTransformer;
+import org.testng.annotations.ITestAnnotation;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+
+/**
+ * Annotation transformer that attaches {@link RetryAnalyzer} to every
+ * test method that does not already have a retry analyzer configured.
+ *
+ * Register this listener in {@code @Listeners} on the base test class
+ * so all automation tests automatically get retry-on-failure behaviour.
+ */
+public class RetryListener implements IAnnotationTransformer {
+
+ @Override
+ public void transform(ITestAnnotation annotation, Class testClass,
+ Constructor testConstructor, Method testMethod) {
+ // TestNG 6.x: getRetryAnalyzer() returns IRetryAnalyzer instance (null if unset)
+ if (annotation.getRetryAnalyzer() == null) {
+ annotation.setRetryAnalyzer(RetryAnalyzer.class);
+ }
+ }
+}
diff --git a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh
index 832e5067..bbded9d4 100755
--- a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh
+++ b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh
@@ -20,6 +20,12 @@
# --------------------------------------------------------------------
set -euo pipefail
+# Force UTC timezone for the entire container session. PXF's Parquet INT96
+# converter uses ZoneId.systemDefault() (ParquetTypeConverter.java) which
+# returns the OS timezone. Rocky 9 base images may ship with a non-UTC
+# default, causing timestamp regressions in Parquet read/write tests.
+export TZ=UTC
+
log() { echo "[entrypoint][$(date '+%F %T')] $*"; }
die() { log "ERROR $*"; exit 1; }
@@ -60,12 +66,16 @@ setup_locale_and_packages() {
log "install base packages and locales"
if [ "$OS_FAMILY" = "deb" ]; then
sudo apt-get update
- sudo apt-get install -y wget lsb-release locales maven unzip openssh-server iproute2 sudo \
+ sudo apt-get install -y wget lsb-release locales maven unzip openssh-server iproute2 sudo psmisc \
openjdk-11-jre-headless openjdk-8-jre-headless
sudo locale-gen en_US.UTF-8 ru_RU.CP1251 ru_RU.UTF-8
sudo update-locale LANG=en_US.UTF-8
else
- sudo dnf install -y wget maven unzip openssh-server iproute sudo \
+ # Disable broken repos that may exist in the base image (e.g. hpc-common)
+ for repo in hpc-common; do
+ sudo dnf config-manager --set-disabled "$repo" 2>/dev/null || true
+ done
+ sudo dnf install -y wget maven unzip openssh-server iproute sudo psmisc \
java-11-openjdk-headless java-1.8.0-openjdk-headless \
glibc-langpack-en glibc-locale-source
sudo localedef -c -i en_US -f UTF-8 en_US.UTF-8 || true
@@ -263,9 +273,14 @@ configure_pxf() {
log "configure PXF"
source "${COMMON_SCRIPTS}/pxf-env.sh"
export PATH="$PXF_HOME/bin:$PATH"
- export PXF_JVM_OPTS="-Xmx512m -Xms256m"
+ export PXF_JVM_OPTS="-Xmx512m -Xms256m -Duser.timezone=UTC"
export PXF_HOST=localhost
- echo "JAVA_HOME=${JAVA_BUILD}" >> "$PXF_BASE/conf/pxf-env.sh"
+ # Persist settings into pxf-env.sh so they survive `pxf restart`
+ cat >> "$PXF_BASE/conf/pxf-env.sh" <> "$PXF_BASE/conf/pxf-application.properties"
cp -v "$PXF_HOME"/templates/{hdfs,mapred,yarn,core,hbase,hive}-site.xml "$PXF_BASE/servers/default"
@@ -430,9 +445,13 @@ wait_for_datanode() {
# Stop any zombie DataNode processes
pkill -f "proc_datanode" 2>/dev/null || true
sleep 2
+ # Force-release DataNode ports
+ for port in 50010 50020 50075 50080; do
+ fuser -k ${port}/tcp 2>/dev/null || true
+ done
+ sleep 3
# Restart DataNode via the singlecluster script
"${GPHD_ROOT}/bin/hadoop-datanode.sh" start 0 2>&1 || true
- "${HADOOP_ROOT}/sbin/hadoop-daemon.sh" --config "${GPHD_ROOT}/storage/hadoop/datanode0/etc/hadoop" start datanode 2>&1 || true
log "DataNode restart issued, waiting again..."
fi
done
@@ -440,6 +459,43 @@ wait_for_datanode() {
die "HDFS DataNode failed to start after ${max_attempts} attempts. Tez upload will fail without a running DataNode."
}
+wait_for_hbase() {
+ log "waiting for HBase RegionServer to become available..."
+ local max_wait=60
+ for i in $(seq 1 ${max_wait}); do
+ if pgrep -f HRegionServer >/dev/null 2>&1; then
+ log "HBase RegionServer is running (after ${i}s), waiting 10s for stabilization..."
+ sleep 10
+ if pgrep -f HRegionServer >/dev/null 2>&1; then
+ log "HBase RegionServer is stable"
+ return 0
+ fi
+ log "HBase RegionServer died during stabilization"
+ break
+ fi
+ sleep 1
+ done
+ # RegionServer didn't come up or crashed; try restarting HBase once
+ log "HBase RegionServer not stable, attempting restart..."
+ ${GPHD_ROOT}/bin/stop-hbase.sh 2>/dev/null || true
+ sleep 2
+ ${GPHD_ROOT}/bin/start-hbase.sh 2>/dev/null || true
+ for i in $(seq 1 60); do
+ if pgrep -f HRegionServer >/dev/null 2>&1; then
+ log "HBase RegionServer is running after restart (after ${i}s), waiting 10s..."
+ sleep 10
+ if pgrep -f HRegionServer >/dev/null 2>&1; then
+ log "HBase RegionServer is stable after restart"
+ return 0
+ fi
+ log "WARN: HBase RegionServer died again during stabilization, continuing anyway"
+ return 0
+ fi
+ sleep 1
+ done
+ log "WARN: HBase RegionServer failed to start after restart, continuing anyway"
+}
+
prepare_hadoop_stack() {
log "prepare Hadoop/Hive/HBase stack"
export JAVA_HOME="${JAVA_HADOOP}"
@@ -468,6 +524,13 @@ prepare_hadoop_stack() {
log "initializing HDFS namenode..."
${GPHD_ROOT}/bin/init-gphd.sh 2>&1 || log "init-gphd.sh failed with exit code $?"
fi
+ # Force-release DataNode ports before starting HDFS to prevent BindException.
+ # On CI re-runs or slow runners, stale sockets/processes may hold these ports.
+ log "ensuring DataNode ports are free..."
+ for port in 50010 50020 50075 50080; do
+ fuser -k ${port}/tcp 2>/dev/null || true
+ done
+ sleep 1
log "starting HDFS/YARN/HBase via start-gphd.sh..."
if ! ${GPHD_ROOT}/bin/start-gphd.sh 2>&1; then
log "start-gphd.sh returned non-zero (services may already be running), continue"
@@ -482,6 +545,7 @@ prepare_hadoop_stack() {
if ! ${GPHD_ROOT}/bin/start-hbase.sh; then
log "start-hbase.sh returned non-zero (services may already be running), continue"
fi
+ wait_for_hbase
start_hive_services
}
diff --git a/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh b/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh
index 63b99352..230222c1 100755
--- a/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh
+++ b/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh
@@ -20,6 +20,9 @@
# --------------------------------------------------------------------
set -euo pipefail
+# Ensure UTC timezone (see entrypoint.sh for rationale)
+export TZ=UTC
+
# Run automation tests only (assumes build/env already prepared)
# Use a unique var name to avoid clobbering by sourced env scripts
@@ -90,6 +93,28 @@ health_check_with_retry() {
fi
}
+mvn_with_retry() {
+ local max_attempts=3
+ for attempt in $(seq 1 ${max_attempts}); do
+ if mvn "$@"; then
+ return 0
+ fi
+ if [ "${attempt}" -lt "${max_attempts}" ]; then
+ echo "[run_tests] Maven failed (attempt ${attempt}/${max_attempts}), retrying in 10s..."
+ sleep 10
+ fi
+ done
+ echo "[run_tests] Maven failed after ${max_attempts} attempts"
+ return 1
+}
+
+resolve_maven_dependencies() {
+ echo "[run_tests] Pre-resolving Maven dependencies..."
+ pushd "${REPO_ROOT}/automation" >/dev/null
+ mvn_with_retry -B -q dependency:resolve -DskipTests 2>&1 || echo "[warn] Maven dependency resolution failed, tests may fail"
+ popd >/dev/null
+}
+
cleanup_hdfs_test_data() {
hdfs dfs -rm -r -f /gpdb-ud-scratch/tmp/pxf_automation_data >/dev/null 2>&1 || true
}
@@ -526,7 +551,7 @@ ensure_testplugin_jar() {
export PXF_HOME=${PXF_HOME:-/usr/local/pxf}
if [ ! -f "${PXF_BASE}/lib/pxf-automation-test.jar" ]; then
pushd "${REPO_ROOT}/automation" >/dev/null
- mvn -q -DskipTests test-compile
+ mvn_with_retry -q -DskipTests test-compile
jar cf "${PXF_BASE}/lib/pxf-automation-test.jar" -C target/classes org/apache/cloudberry/pxf/automation/testplugin
popd >/dev/null
JAVA_HOME="${JAVA_BUILD}" "${PXF_HOME}/bin/pxf" restart >/dev/null || true
@@ -853,10 +878,13 @@ generate_test_summary() {
run_single_group() {
local group="$1"
echo "[run_tests] Running single test group: $group"
-
+
+ # Pre-resolve Maven dependencies with retry for transient network failures
+ resolve_maven_dependencies
+
# Run health check first
health_check_with_retry
-
+
ensure_testuser_pg_hba
export PGHOST=127.0.0.1
export PATH="${GPHOME}/bin:${PATH}"
diff --git a/ci/docker/pxf-cbdb-dev/common/script/utils.sh b/ci/docker/pxf-cbdb-dev/common/script/utils.sh
index c055dd25..44755bfd 100755
--- a/ci/docker/pxf-cbdb-dev/common/script/utils.sh
+++ b/ci/docker/pxf-cbdb-dev/common/script/utils.sh
@@ -45,19 +45,23 @@ check_jvm_procs() {
fi
echo "$jps_out"
echo "$jps_out" | grep -q NameNode || die "NameNode not running"
- echo "$jps_out" | grep -q DataNode || die "DataNode not running"
+ echo "$jps_out" | grep -q DataNode || log "WARN: DataNode not running (may still be registering)"
}
check_hbase() {
local hbase_host="${HBASE_HOST:-$(hostname -I | awk '{print $1}')}"
hbase_host=${hbase_host:-127.0.0.1}
+ # HBase checks are non-fatal: test groups that need HBase will fail with
+ # clear test errors; groups that don't need HBase should not be blocked.
if ! echo "$jps_out" | grep -q HMaster && ! pgrep -f HMaster >/dev/null 2>&1; then
- die "HBase HMaster not running"
+ log "WARN: HBase HMaster not running"
+ return 0
fi
if ! echo "$jps_out" | grep -q HRegionServer && ! pgrep -f HRegionServer >/dev/null 2>&1; then
- die "HBase RegionServer not running"
+ log "WARN: HBase RegionServer not running"
+ return 0
fi
local hbase_ok=true
@@ -69,7 +73,7 @@ check_hbase() {
fi
if [ "${hbase_ok}" != "true" ]; then
[ -f /tmp/hbase_status.log ] && cat /tmp/hbase_status.log
- die "HBase health check failed (status or port 16000 on ${hbase_host})"
+ log "WARN: HBase health check failed (status or port 16000 on ${hbase_host})"
fi
}
diff --git a/ci/singlecluster/Dockerfile b/ci/singlecluster/Dockerfile
index 4d6bb655..c61deef4 100644
--- a/ci/singlecluster/Dockerfile
+++ b/ci/singlecluster/Dockerfile
@@ -50,16 +50,8 @@ ENV ZOOKEEPER_SHA512="0e5a64713abc6f36d961dd61a06f681868171a9d9228366e512a013248
ENV HBASE_SHA512="1032521025660daa70260cdc931f52a26c87596be444451fe1fa88b526ede55e9d6b4220e91ff6f7422bec11f30d64fa6745e95a9c36971fdb1a264a2c745693"
ENV TEZ_SHA512="a2d94bd9fa778d42a8bac9d9da8e263e469ddfef93968b06434716554995f490231de5607541ac236e770aa0158b64250c38bc1cd57dbfa629fea705f2ffa2f5"
-# faster mirror:
-ENV APACHE_MIRROR="repo.huaweicloud.com/apache"
-#ENV APACHE_MIRROR="archive.apache.org/dist/"
-#ENV APACHE_MIRROR="mirror.yandex.ru/mirrors/apache/"
-
-ENV HADOOP_URL="https://$APACHE_MIRROR/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz"
-ENV HIVE_URL="https://$APACHE_MIRROR/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz"
-ENV ZOOKEEPER_URL="https://$APACHE_MIRROR/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz"
-ENV HBASE_URL="https://$APACHE_MIRROR/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz"
-ENV TEZ_URL="https://$APACHE_MIRROR/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz"
+# Mirror list: try fast mirrors first, fall back to official archive
+ENV APACHE_MIRRORS="dlcdn.apache.org archive.apache.org/dist"
ENV GPHD_ROOT=/home/gpadmin/workspace/singlecluster
ENV HADOOP_ROOT=$GPHD_ROOT/hadoop
@@ -68,34 +60,54 @@ ENV HIVE_ROOT=$GPHD_ROOT/hive
ENV ZOOKEEPER_ROOT=$GPHD_ROOT/zookeeper
ENV TEZ_ROOT=$GPHD_ROOT/tez
+# Helper: download from first working mirror with retry
+# Usage: apache_download
+RUN sudo tee /usr/local/bin/apache_download.sh > /dev/null <<'DLEOF' && sudo chmod +x /usr/local/bin/apache_download.sh
+#!/bin/bash
+set -e
+rel_path="$1"; output="$2"
+for mirror in $APACHE_MIRRORS; do
+ url="https://${mirror}/${rel_path}"
+ echo "Trying: $url"
+ if curl -fSL --retry 2 --retry-delay 3 --connect-timeout 15 "$url" -o "$output" 2>&1; then
+ echo "Downloaded from $mirror"
+ exit 0
+ fi
+ echo "Failed from $mirror, trying next..."
+ rm -f "$output"
+done
+echo "ERROR: all mirrors failed for $rel_path"
+exit 1
+DLEOF
+
RUN mkdir -p $HADOOP_ROOT && \
- curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz && \
+ apache_download.sh "hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" /tmp/hadoop.tar.gz && \
echo "$HADOOP_SHA512 /tmp/hadoop.tar.gz" | sha512sum -c && \
tar xvf /tmp/hadoop.tar.gz -C $HADOOP_ROOT --strip-components 1 --exclude="share/doc/*" --exclude="*-sources.jar" && \
rm /tmp/hadoop.tar.gz && \
- curl -fSL "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \
+ curl -fSL --retry 2 "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \
-o $HADOOP_ROOT/share/hadoop/common/lib/javax.activation-api-1.2.0.jar
RUN mkdir -p $HIVE_ROOT && \
- curl -fSL $HIVE_URL -o /tmp/hive.tar.gz && \
+ apache_download.sh "hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz" /tmp/hive.tar.gz && \
echo "$HIVE_SHA256 /tmp/hive.tar.gz" | sha256sum -c && \
tar xvf /tmp/hive.tar.gz -C $HIVE_ROOT --strip-components 1 && \
rm /tmp/hive.tar.gz
RUN mkdir -p $ZOOKEEPER_ROOT && \
- curl -fSL $ZOOKEEPER_URL -o /tmp/zookeeper.tar.gz && \
+ apache_download.sh "zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz" /tmp/zookeeper.tar.gz && \
echo "$ZOOKEEPER_SHA512 /tmp/zookeeper.tar.gz" | sha512sum -c && \
tar xvf /tmp/zookeeper.tar.gz -C $ZOOKEEPER_ROOT --strip-components 1 --exclude="docs/*" && \
rm /tmp/zookeeper.tar.gz
RUN mkdir -p $HBASE_ROOT && \
- curl -fSL "$HBASE_URL" -o /tmp/hbase.tar.gz && \
+ apache_download.sh "hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz" /tmp/hbase.tar.gz && \
echo "$HBASE_SHA512 /tmp/hbase.tar.gz" | sha512sum -c && \
tar xvf /tmp/hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" --exclude="lib/*-tests.jar" --exclude="lib/shaded-clients" && \
rm /tmp/hbase.tar.gz
RUN mkdir -p $TEZ_ROOT && \
- curl -fSL "$TEZ_URL" -o /tmp/tez.tar.gz && \
+ apache_download.sh "tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz" /tmp/tez.tar.gz && \
echo "$TEZ_SHA512 /tmp/tez.tar.gz" | sha512sum -c && \
tar xvf /tmp/tez.tar.gz -C $TEZ_ROOT --strip-components 1 && \
rm /tmp/tez.tar.gz
diff --git a/server/gradlew-install.sh b/server/gradlew-install.sh
index 510fa2ad..71dc0c70 100755
--- a/server/gradlew-install.sh
+++ b/server/gradlew-install.sh
@@ -58,13 +58,23 @@ if [ ! -e "${GRADLE_WRAPPER_JAR}" ]; then
# The Gradle version extracted from the `distributionUrl` property does not contain ".0" patch
# versions. Need to append a ".0" in that case to download the wrapper jar.
GRADLE_VERSION="$(echo "$GRADLE_DIST_VERSION" | sed 's/^\([0-9]*[.][0-9]*\)$/\1.0/')"
- curl --location --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || exit 1
- JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)"
EXPECTED="$(cat "${GRADLE_WRAPPER_SHA256}")"
- if [ "${JAR_CHECKSUM}" != "${EXPECTED}" ]; then
- # If the (just downloaded) checksum and the downloaded wrapper jar do not match, something
- # really bad is going on.
+ MAX_RETRIES=3
+ for _retry in $(seq 1 ${MAX_RETRIES}); do
+ curl --location --fail --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || {
+ echo "Download attempt ${_retry}/${MAX_RETRIES} failed (curl error)" > /dev/stderr
+ rm -f "${GRADLE_WRAPPER_JAR}"
+ if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi
+ exit 1
+ }
+ JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)"
+ if [ "${JAR_CHECKSUM}" = "${EXPECTED}" ]; then
+ break
+ fi
+ echo "SHA256 mismatch on attempt ${_retry}/${MAX_RETRIES} (got ${JAR_CHECKSUM}, expected ${EXPECTED})" > /dev/stderr
+ rm -f "${GRADLE_WRAPPER_JAR}"
+ if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi
echo "Expected sha256 of the downloaded gradle-wrapper.jar does not match the downloaded sha256!" > /dev/stderr
exit 1
- fi
+ done
fi