Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions .github/workflows/pxf-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,15 @@ jobs:
with:
path: cloudberry-pxf

- name: Cache singlecluster image
id: cache-image
uses: actions/cache@v4
with:
path: /tmp/singlecluster-image.tar
key: singlecluster-ubuntu-${{ hashFiles('ci/singlecluster/**') }}

- name: Build singlecluster image
if: steps.cache-image.outputs.cache-hit != 'true'
run: |
cd cloudberry-pxf/ci/singlecluster
docker build -t pxf/singlecluster:3 .
Expand Down Expand Up @@ -185,7 +193,15 @@ jobs:
with:
path: cloudberry-pxf

- name: Cache singlecluster Rocky 9 image
id: cache-image-rocky9
uses: actions/cache@v4
with:
path: /tmp/singlecluster-rocky9-image.tar
key: singlecluster-rocky9-${{ hashFiles('ci/singlecluster/**') }}

- name: Build singlecluster Rocky 9 image
if: steps.cache-image-rocky9.outputs.cache-hit != 'true'
run: |
cd cloudberry-pxf/ci/singlecluster
docker build --build-arg BASE_IMAGE=apache/incubator-cloudberry:cbdb-build-rocky9-latest -t pxf/singlecluster-rocky9:3 .
Expand Down Expand Up @@ -364,8 +380,8 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

Expand Down Expand Up @@ -536,8 +552,8 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

Expand Down
6 changes: 6 additions & 0 deletions automation/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@
<argLine>-Xmx4096m</argLine>
<forkCount>1</forkCount>
<reuseForks>false</reuseForks>
<properties>
<property>
<name>listener</name>
<value>listeners.RetryListener</value>
</property>
</properties>
</configuration>
<executions>
<execution>
Expand Down
47 changes: 47 additions & 0 deletions automation/src/main/java/listeners/RetryAnalyzer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package listeners;

import org.testng.IRetryAnalyzer;
import org.testng.ITestResult;

import java.util.Random;

/**
 * Retries failed tests up to {@value #MAX_RETRIES} times with exponential
 * backoff to handle transient CI failures (e.g. HDFS multi-block write
 * timeouts on resource-constrained GitHub Actions runners).
 *
 * <p>Delay schedule: 3-8s, 6-16s, 12-32s (each bound capped at
 * {@value #MAX_DELAY_MS} ms). Delays are randomized within each window to
 * avoid retrying several tests in lockstep.
 */
public class RetryAnalyzer implements IRetryAnalyzer {

    /** Maximum number of retries after the initial attempt. */
    private static final int MAX_RETRIES = 3;
    /** Lower bound of the first retry's delay window, in milliseconds. */
    private static final int BASE_MIN_MS = 3000;
    /** Upper bound of the first retry's delay window, in milliseconds. */
    private static final int BASE_MAX_MS = 8000;
    /** Absolute cap applied to both delay bounds, in milliseconds. */
    private static final int MAX_DELAY_MS = 60000;

    /** Retries already performed for the test method this analyzer is attached to. */
    private int retryCount = 0;
    /** Jitter source for the randomized backoff delay. */
    private final Random random = new Random();

    /**
     * Decides whether a failed test should be re-run, sleeping for a
     * randomized exponential-backoff delay before each retry.
     *
     * @param result the failed test result supplied by TestNG
     * @return {@code true} to re-run the test, {@code false} once the retry
     *         budget is exhausted
     */
    @Override
    public boolean retry(ITestResult result) {
        if (retryCount >= MAX_RETRIES) {
            return false;
        }
        retryCount++;
        int multiplier = 1 << (retryCount - 1); // 1, 2, 4
        int minDelay = Math.min(BASE_MIN_MS * multiplier, MAX_DELAY_MS);
        int maxDelay = Math.min(BASE_MAX_MS * multiplier, MAX_DELAY_MS);
        // nextInt's bound is exclusive, so +1 makes maxDelay reachable.
        int delay = minDelay + random.nextInt(maxDelay - minDelay + 1);
        System.out.println("[RetryAnalyzer] Retrying failed test: "
                + result.getTestClass().getName() + "."
                + result.getMethod().getMethodName()
                + " after " + delay + "ms delay"
                + " (attempt " + (retryCount + 1) + "/" + (MAX_RETRIES + 1) + ")");
        try {
            Thread.sleep(delay);
        } catch (InterruptedException e) {
            // Preserve the interrupt status so callers can observe cancellation.
            Thread.currentThread().interrupt();
        }
        return true;
    }
}
26 changes: 26 additions & 0 deletions automation/src/main/java/listeners/RetryListener.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package listeners;

import org.testng.IAnnotationTransformer;
import org.testng.annotations.ITestAnnotation;

import java.lang.reflect.Constructor;
import java.lang.reflect.Method;

/**
 * TestNG annotation transformer that installs {@link RetryAnalyzer} on every
 * {@code @Test} method that does not already have a retry analyzer configured.
 * <p>
 * Note: TestNG does not allow {@code IAnnotationTransformer} implementations
 * to be registered via {@code @Listeners}; this class is wired in through the
 * surefire {@code listener} property (see automation/pom.xml) so that all
 * automation tests pick up retry-on-failure behaviour.
 */
public class RetryListener implements IAnnotationTransformer {

    @Override
    public void transform(ITestAnnotation annotation, Class testClass,
                          Constructor testConstructor, Method testMethod) {
        // TestNG 6.x: getRetryAnalyzer() returns the configured IRetryAnalyzer
        // instance, or null when none was set on the annotation.
        boolean alreadyConfigured = annotation.getRetryAnalyzer() != null;
        if (!alreadyConfigured) {
            annotation.setRetryAnalyzer(RetryAnalyzer.class);
        }
    }
}
67 changes: 63 additions & 4 deletions ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
# --------------------------------------------------------------------
set -euo pipefail

# Force UTC timezone for the entire container session. PXF's Parquet INT96
# converter uses ZoneId.systemDefault() (ParquetTypeConverter.java) which
# returns the OS timezone. Rocky 9 base images may ship with a non-UTC
# default, causing timestamp regressions in Parquet read/write tests.
export TZ=UTC

log() { echo "[entrypoint][$(date '+%F %T')] $*"; }
die() { log "ERROR $*"; exit 1; }

Expand Down Expand Up @@ -60,12 +66,16 @@ setup_locale_and_packages() {
log "install base packages and locales"
if [ "$OS_FAMILY" = "deb" ]; then
sudo apt-get update
sudo apt-get install -y wget lsb-release locales maven unzip openssh-server iproute2 sudo \
sudo apt-get install -y wget lsb-release locales maven unzip openssh-server iproute2 sudo psmisc \
openjdk-11-jre-headless openjdk-8-jre-headless
sudo locale-gen en_US.UTF-8 ru_RU.CP1251 ru_RU.UTF-8
sudo update-locale LANG=en_US.UTF-8
else
sudo dnf install -y wget maven unzip openssh-server iproute sudo \
# Disable broken repos that may exist in the base image (e.g. hpc-common)
for repo in hpc-common; do
sudo dnf config-manager --set-disabled "$repo" 2>/dev/null || true
done
sudo dnf install -y wget maven unzip openssh-server iproute sudo psmisc \
java-11-openjdk-headless java-1.8.0-openjdk-headless \
glibc-langpack-en glibc-locale-source
sudo localedef -c -i en_US -f UTF-8 en_US.UTF-8 || true
Expand Down Expand Up @@ -263,7 +273,7 @@ configure_pxf() {
log "configure PXF"
source "${COMMON_SCRIPTS}/pxf-env.sh"
export PATH="$PXF_HOME/bin:$PATH"
export PXF_JVM_OPTS="-Xmx512m -Xms256m"
export PXF_JVM_OPTS="-Xmx512m -Xms256m -Duser.timezone=UTC"
export PXF_HOST=localhost
echo "JAVA_HOME=${JAVA_BUILD}" >> "$PXF_BASE/conf/pxf-env.sh"
sed -i 's/# server.address=localhost/server.address=0.0.0.0/' "$PXF_BASE/conf/pxf-application.properties"
Expand Down Expand Up @@ -430,16 +440,57 @@ wait_for_datanode() {
# Stop any zombie DataNode processes
pkill -f "proc_datanode" 2>/dev/null || true
sleep 2
# Force-release DataNode ports
for port in 50010 50020 50075 50080; do
fuser -k ${port}/tcp 2>/dev/null || true
done
sleep 3
# Restart DataNode via the singlecluster script
"${GPHD_ROOT}/bin/hadoop-datanode.sh" start 0 2>&1 || true
"${HADOOP_ROOT}/sbin/hadoop-daemon.sh" --config "${GPHD_ROOT}/storage/hadoop/datanode0/etc/hadoop" start datanode 2>&1 || true
log "DataNode restart issued, waiting again..."
fi
done

die "HDFS DataNode failed to start after ${max_attempts} attempts. Tez upload will fail without a running DataNode."
}

# Wait (up to ~60s) for an HBase RegionServer process to appear, then re-check
# after a 10s stabilization window to catch early crashes. If the RegionServer
# never stabilizes, restart HBase once and repeat the wait. Always non-fatal:
# test groups that need HBase will surface their own failures, so this only
# logs WARN and returns 0 when HBase cannot be stabilized.
wait_for_hbase() {
    log "waiting for HBase RegionServer to become available..."
    local max_wait=60
    local i
    for i in $(seq 1 "${max_wait}"); do
        if pgrep -f HRegionServer >/dev/null 2>&1; then
            log "HBase RegionServer is running (after ${i}s), waiting 10s for stabilization..."
            sleep 10
            if pgrep -f HRegionServer >/dev/null 2>&1; then
                log "HBase RegionServer is stable"
                return 0
            fi
            log "HBase RegionServer died during stabilization"
            break
        fi
        sleep 1
    done
    # RegionServer didn't come up or crashed; try restarting HBase once.
    log "HBase RegionServer not stable, attempting restart..."
    "${GPHD_ROOT}/bin/stop-hbase.sh" 2>/dev/null || true
    sleep 2
    "${GPHD_ROOT}/bin/start-hbase.sh" 2>/dev/null || true
    for i in $(seq 1 60); do
        if pgrep -f HRegionServer >/dev/null 2>&1; then
            log "HBase RegionServer is running after restart (after ${i}s), waiting 10s..."
            sleep 10
            if pgrep -f HRegionServer >/dev/null 2>&1; then
                log "HBase RegionServer is stable after restart"
                return 0
            fi
            # Crashed again: give up, but continue so non-HBase groups still run.
            log "WARN: HBase RegionServer died again during stabilization, continuing anyway"
            return 0
        fi
        sleep 1
    done
    log "WARN: HBase RegionServer failed to start after restart, continuing anyway"
}

prepare_hadoop_stack() {
log "prepare Hadoop/Hive/HBase stack"
export JAVA_HOME="${JAVA_HADOOP}"
Expand Down Expand Up @@ -468,6 +519,13 @@ prepare_hadoop_stack() {
log "initializing HDFS namenode..."
${GPHD_ROOT}/bin/init-gphd.sh 2>&1 || log "init-gphd.sh failed with exit code $?"
fi
# Force-release DataNode ports before starting HDFS to prevent BindException.
# On CI re-runs or slow runners, stale sockets/processes may hold these ports.
log "ensuring DataNode ports are free..."
for port in 50010 50020 50075 50080; do
fuser -k ${port}/tcp 2>/dev/null || true
done
sleep 1
log "starting HDFS/YARN/HBase via start-gphd.sh..."
if ! ${GPHD_ROOT}/bin/start-gphd.sh 2>&1; then
log "start-gphd.sh returned non-zero (services may already be running), continue"
Expand All @@ -482,6 +540,7 @@ prepare_hadoop_stack() {
if ! ${GPHD_ROOT}/bin/start-hbase.sh; then
log "start-hbase.sh returned non-zero (services may already be running), continue"
fi
wait_for_hbase
start_hive_services
}

Expand Down
34 changes: 31 additions & 3 deletions ci/docker/pxf-cbdb-dev/common/script/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
# --------------------------------------------------------------------
set -euo pipefail

# Ensure UTC timezone (see entrypoint.sh for rationale)
export TZ=UTC

# Run automation tests only (assumes build/env already prepared)

# Use a unique var name to avoid clobbering by sourced env scripts
Expand Down Expand Up @@ -90,6 +93,28 @@ health_check_with_retry() {
fi
}

# Run `mvn "$@"`, retrying up to 3 times with a 10s pause between attempts to
# ride out transient failures (network flakes, repository 5xx responses).
# Returns 0 on the first successful attempt, 1 if all attempts fail.
mvn_with_retry() {
    local max_attempts=3
    # Declare the loop variable local so it does not leak into the caller's
    # scope (the original left `attempt` global).
    local attempt
    for attempt in $(seq 1 "${max_attempts}"); do
        if mvn "$@"; then
            return 0
        fi
        if [ "${attempt}" -lt "${max_attempts}" ]; then
            echo "[run_tests] Maven failed (attempt ${attempt}/${max_attempts}), retrying in 10s..."
            sleep 10
        fi
    done
    echo "[run_tests] Maven failed after ${max_attempts} attempts"
    return 1
}

# Warm the local Maven repository before running tests, so transient download
# failures surface here (with retries) instead of mid-test. Resolution failure
# is non-fatal: only a warning is printed and the caller proceeds.
resolve_maven_dependencies() {
    echo "[run_tests] Pre-resolving Maven dependencies..."
    pushd "${REPO_ROOT}/automation" >/dev/null
    if ! mvn_with_retry -B -q dependency:resolve -DskipTests 2>&1; then
        echo "[warn] Maven dependency resolution failed, tests may fail"
    fi
    popd >/dev/null
}

# Remove leftover automation scratch data from HDFS. Best-effort: errors and
# missing paths are ignored (output discarded, `|| true`) so repeated runs
# stay idempotent under `set -e`.
cleanup_hdfs_test_data() {
    hdfs dfs -rm -r -f /gpdb-ud-scratch/tmp/pxf_automation_data >/dev/null 2>&1 || true
}
Expand Down Expand Up @@ -526,7 +551,7 @@ ensure_testplugin_jar() {
export PXF_HOME=${PXF_HOME:-/usr/local/pxf}
if [ ! -f "${PXF_BASE}/lib/pxf-automation-test.jar" ]; then
pushd "${REPO_ROOT}/automation" >/dev/null
mvn -q -DskipTests test-compile
mvn_with_retry -q -DskipTests test-compile
jar cf "${PXF_BASE}/lib/pxf-automation-test.jar" -C target/classes org/apache/cloudberry/pxf/automation/testplugin
popd >/dev/null
JAVA_HOME="${JAVA_BUILD}" "${PXF_HOME}/bin/pxf" restart >/dev/null || true
Expand Down Expand Up @@ -853,10 +878,13 @@ generate_test_summary() {
run_single_group() {
local group="$1"
echo "[run_tests] Running single test group: $group"


# Pre-resolve Maven dependencies with retry for transient network failures
resolve_maven_dependencies

# Run health check first
health_check_with_retry

ensure_testuser_pg_hba
export PGHOST=127.0.0.1
export PATH="${GPHOME}/bin:${PATH}"
Expand Down
12 changes: 8 additions & 4 deletions ci/docker/pxf-cbdb-dev/common/script/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,23 @@ check_jvm_procs() {
fi
echo "$jps_out"
echo "$jps_out" | grep -q NameNode || die "NameNode not running"
echo "$jps_out" | grep -q DataNode || die "DataNode not running"
echo "$jps_out" | grep -q DataNode || log "WARN: DataNode not running (may still be registering)"
}

check_hbase() {
local hbase_host="${HBASE_HOST:-$(hostname -I | awk '{print $1}')}"
hbase_host=${hbase_host:-127.0.0.1}

# HBase checks are non-fatal: test groups that need HBase will fail with
# clear test errors; groups that don't need HBase should not be blocked.
if ! echo "$jps_out" | grep -q HMaster && ! pgrep -f HMaster >/dev/null 2>&1; then
die "HBase HMaster not running"
log "WARN: HBase HMaster not running"
return 0
fi

if ! echo "$jps_out" | grep -q HRegionServer && ! pgrep -f HRegionServer >/dev/null 2>&1; then
die "HBase RegionServer not running"
log "WARN: HBase RegionServer not running"
return 0
fi

local hbase_ok=true
Expand All @@ -69,7 +73,7 @@ check_hbase() {
fi
if [ "${hbase_ok}" != "true" ]; then
[ -f /tmp/hbase_status.log ] && cat /tmp/hbase_status.log
die "HBase health check failed (status or port 16000 on ${hbase_host})"
log "WARN: HBase health check failed (status or port 16000 on ${hbase_host})"
fi
}

Expand Down
Loading
Loading