Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
e5bf595
fix ci
MisterRaindrop Mar 26, 2026
8acabec
fix
MisterRaindrop Mar 27, 2026
01d5b26
fix
MisterRaindrop Mar 27, 2026
52e04d2
fix
MisterRaindrop Mar 27, 2026
d53ca88
fix
MisterRaindrop Mar 27, 2026
a719511
fix: resolve Rocky 9 Parquet test failures and improve service stability
MisterRaindrop Mar 30, 2026
eb69809
fix: use correct HBase RegionServer port 60020 in wait_for_hbase
MisterRaindrop Mar 30, 2026
9c2c409
fix: simplify wait_for_hbase - remove broken /dev/tcp port check
MisterRaindrop Mar 30, 2026
a81a492
fix: preemptive port cleanup with fuser + fix pipefail bug in DataNod…
MisterRaindrop Mar 31, 2026
60c28e3
feat: add TestNG retry analyzer for transient CI test failures
MisterRaindrop Mar 31, 2026
9d19168
fix: use TestNG 6.x API getRetryAnalyzer() instead of 7.x getRetryAna…
MisterRaindrop Mar 31, 2026
3ec0a0b
fix: remove type assignment - TestNG 6.x getRetryAnalyzer() returns I…
MisterRaindrop Mar 31, 2026
fc74036
fix: register RetryListener via surefire config instead of @Listeners
MisterRaindrop Mar 31, 2026
951daee
fix: install psmisc package to provide fuser for DataNode port cleanup
MisterRaindrop Apr 1, 2026
6a7cddb
feat: enhance RetryAnalyzer with 3 retries and exponential backoff (3…
MisterRaindrop Apr 1, 2026
5c458fc
feat: cache singlecluster Docker image and use Apache official CDN
MisterRaindrop Apr 1, 2026
6d4be16
fix: persist TZ=UTC and PXF_JVM_OPTS into pxf-env.sh for pxf restart
MisterRaindrop Apr 2, 2026
4cc87f7
feat: pre-build test-ready Docker images to speed up CI
MisterRaindrop Apr 2, 2026
44d342b
fix: mkdir -p cloudberry-source before tar extract in build-test-image
MisterRaindrop Apr 2, 2026
8f0c8ef
fix: merge apt/dnf RUN layers to avoid cache invalidation in Dockerfi…
MisterRaindrop Apr 2, 2026
df0d447
fix: add USER root to Dockerfile.test-ready for apt/dnf permissions
MisterRaindrop Apr 2, 2026
4c43a72
fix: strip cloudberry/ prefix when extracting source tar for Dockerfi…
MisterRaindrop Apr 2, 2026
daa01c4
fix: move demo cluster creation to runtime (Docker build hostname mis…
MisterRaindrop Apr 2, 2026
fdb4d1b
fix: rescan SSH host keys after sshd start for Ganymed SSH authentica…
MisterRaindrop Apr 2, 2026
07a59a2
fix: fully re-configure SSH at runtime (password, crypto-policy, sshd…
MisterRaindrop Apr 2, 2026
8babe36
fix: use full setup_ssh() from original entrypoint instead of pre-bak…
MisterRaindrop Apr 2, 2026
44f4f2e
fix: skip retry for multi-block write tests that accumulate data with…
MisterRaindrop Apr 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/pxf-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,8 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

Expand Down Expand Up @@ -536,8 +536,8 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

Expand Down
37 changes: 37 additions & 0 deletions ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,17 @@ wait_for_datanode() {
log "Attempting to restart DataNode..."
# Stop any zombie DataNode processes
pkill -f "proc_datanode" 2>/dev/null || true
pkill -f "datanode" 2>/dev/null || true
sleep 2
# Kill any process still holding DataNode ports (50010/50020/50075)
for port in 50010 50020 50075; do
local pid
pid=$(ss -tlnp "sport = :${port}" 2>/dev/null | grep -oP 'pid=\K[0-9]+' | head -1)
if [ -n "${pid}" ]; then
log "Killing process ${pid} holding port ${port}"
kill -9 "${pid}" 2>/dev/null || true
fi
done
sleep 2
# Restart DataNode via the singlecluster script
"${GPHD_ROOT}/bin/hadoop-datanode.sh" start 0 2>&1 || true
Expand All @@ -440,6 +451,31 @@ wait_for_datanode() {
die "HDFS DataNode failed to start after ${max_attempts} attempts. Tez upload will fail without a running DataNode."
}

#######################################
# Poll once per second for a process whose command line matches
# "HRegionServer".
# Arguments:
#   $1 - maximum number of polls (one second apart)
#   $2 - text spliced into the success log message (may be empty)
# Outputs:   one log line when the process is found
# Returns:   0 as soon as pgrep finds the process, 1 if the budget runs out
#######################################
_poll_hbase_regionserver() {
  local budget=$1
  local label=$2
  local elapsed=1
  while [ "${elapsed}" -le "${budget}" ]; do
    if pgrep -f HRegionServer >/dev/null 2>&1; then
      log "HBase RegionServer is running${label} (after ${elapsed}s)"
      return 0
    fi
    sleep 1
    elapsed=$((elapsed + 1))
  done
  return 1
}

#######################################
# Block until an HBase RegionServer process is visible, restarting HBase
# once if it does not show up within the wait budget.
# Globals:   GPHD_ROOT (read) - root holding bin/stop-hbase.sh and
#            bin/start-hbase.sh
# Arguments: $1 - optional per-phase wait budget in seconds (default 60);
#            the same budget is used before and after the restart
# Returns:   0 once the RegionServer process is found; otherwise calls die
#######################################
wait_for_hbase() {
  local max_wait=${1:-60}

  log "waiting for HBase RegionServer to become available..."
  if _poll_hbase_regionserver "${max_wait}" ""; then
    return 0
  fi

  # RegionServer didn't come up; try restarting HBase once.
  log "HBase RegionServer not found after ${max_wait}s, attempting restart..."
  # Best effort: a failing stop/start must not abort here, because the poll
  # below is what decides success or failure.
  "${GPHD_ROOT}/bin/stop-hbase.sh" 2>/dev/null || true
  sleep 2
  "${GPHD_ROOT}/bin/start-hbase.sh" 2>/dev/null || true

  if _poll_hbase_regionserver "${max_wait}" " after restart"; then
    return 0
  fi
  die "HBase RegionServer failed to start after restart"
}

prepare_hadoop_stack() {
log "prepare Hadoop/Hive/HBase stack"
export JAVA_HOME="${JAVA_HADOOP}"
Expand Down Expand Up @@ -482,6 +518,7 @@ prepare_hadoop_stack() {
if ! ${GPHD_ROOT}/bin/start-hbase.sh; then
log "start-hbase.sh returned non-zero (services may already be running), continue"
fi
wait_for_hbase
start_hive_services
}

Expand Down
44 changes: 28 additions & 16 deletions ci/singlecluster/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,8 @@ ENV ZOOKEEPER_SHA512="0e5a64713abc6f36d961dd61a06f681868171a9d9228366e512a013248
ENV HBASE_SHA512="1032521025660daa70260cdc931f52a26c87596be444451fe1fa88b526ede55e9d6b4220e91ff6f7422bec11f30d64fa6745e95a9c36971fdb1a264a2c745693"
ENV TEZ_SHA512="a2d94bd9fa778d42a8bac9d9da8e263e469ddfef93968b06434716554995f490231de5607541ac236e770aa0158b64250c38bc1cd57dbfa629fea705f2ffa2f5"

# faster mirror:
ENV APACHE_MIRROR="repo.huaweicloud.com/apache"
#ENV APACHE_MIRROR="archive.apache.org/dist/"
#ENV APACHE_MIRROR="mirror.yandex.ru/mirrors/apache/"

ENV HADOOP_URL="https://$APACHE_MIRROR/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz"
ENV HIVE_URL="https://$APACHE_MIRROR/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz"
ENV ZOOKEEPER_URL="https://$APACHE_MIRROR/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz"
ENV HBASE_URL="https://$APACHE_MIRROR/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz"
ENV TEZ_URL="https://$APACHE_MIRROR/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz"
# Mirror list: try fast mirrors first, fall back to official archive
ENV APACHE_MIRRORS="repo.huaweicloud.com/apache archive.apache.org/dist"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can cache the singlecluster image in the GitHub Actions cache for 7 days, so that we will not see this issue often.


ENV GPHD_ROOT=/home/gpadmin/workspace/singlecluster
ENV HADOOP_ROOT=$GPHD_ROOT/hadoop
Expand All @@ -68,34 +60,54 @@ ENV HIVE_ROOT=$GPHD_ROOT/hive
ENV ZOOKEEPER_ROOT=$GPHD_ROOT/zookeeper
ENV TEZ_ROOT=$GPHD_ROOT/tez

# Helper: download from first working mirror with retry
# Usage: apache_download.sh <relative_path> <output_file>
RUN sudo tee /usr/local/bin/apache_download.sh > /dev/null <<'DLEOF' && sudo chmod +x /usr/local/bin/apache_download.sh
#!/bin/bash
#
# apache_download.sh - fetch one Apache release artifact, trying each
# configured mirror in turn until one succeeds.
#
# Usage: apache_download.sh <relative_path> <output_file>
#   <relative_path>  path of the artifact below the mirror root
#   <output_file>    local file the download is written to
#
# Environment:
#   APACHE_MIRRORS   whitespace-separated mirror roots (host/path, no
#                    scheme), tried in the listed order
#
# Exit status: 0 after the first successful download, 1 if every mirror
# failed, 2 on a usage or configuration error.
set -e

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <relative_path> <output_file>" >&2
  exit 2
fi
rel_path="$1"
output="$2"

# Report a missing mirror list as such instead of as "all mirrors failed".
if [ -z "${APACHE_MIRRORS:-}" ]; then
  echo "ERROR: APACHE_MIRRORS is empty or unset" >&2
  exit 2
fi

# Word splitting is intentional: APACHE_MIRRORS is a whitespace-separated list.
for mirror in $APACHE_MIRRORS; do
  url="https://${mirror}/${rel_path}"
  echo "Trying: $url"
  if curl -fSL --retry 2 --retry-delay 3 --connect-timeout 15 "$url" -o "$output" 2>&1; then
    echo "Downloaded from $mirror"
    exit 0
  fi
  echo "Failed from $mirror, trying next..." >&2
  # Drop any partial file so a later step cannot pick it up.
  rm -f -- "$output"
done
echo "ERROR: all mirrors failed for $rel_path" >&2
exit 1
DLEOF

RUN mkdir -p $HADOOP_ROOT && \
curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz && \
apache_download.sh "hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" /tmp/hadoop.tar.gz && \
echo "$HADOOP_SHA512 /tmp/hadoop.tar.gz" | sha512sum -c && \
tar xvf /tmp/hadoop.tar.gz -C $HADOOP_ROOT --strip-components 1 --exclude="share/doc/*" --exclude="*-sources.jar" && \
rm /tmp/hadoop.tar.gz && \
curl -fSL "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \
curl -fSL --retry 2 "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \
-o $HADOOP_ROOT/share/hadoop/common/lib/javax.activation-api-1.2.0.jar

RUN mkdir -p $HIVE_ROOT && \
curl -fSL $HIVE_URL -o /tmp/hive.tar.gz && \
apache_download.sh "hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz" /tmp/hive.tar.gz && \
echo "$HIVE_SHA256 /tmp/hive.tar.gz" | sha256sum -c && \
tar xvf /tmp/hive.tar.gz -C $HIVE_ROOT --strip-components 1 && \
rm /tmp/hive.tar.gz

RUN mkdir -p $ZOOKEEPER_ROOT && \
curl -fSL $ZOOKEEPER_URL -o /tmp/zookeeper.tar.gz && \
apache_download.sh "zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz" /tmp/zookeeper.tar.gz && \
echo "$ZOOKEEPER_SHA512 /tmp/zookeeper.tar.gz" | sha512sum -c && \
tar xvf /tmp/zookeeper.tar.gz -C $ZOOKEEPER_ROOT --strip-components 1 --exclude="docs/*" && \
rm /tmp/zookeeper.tar.gz

RUN mkdir -p $HBASE_ROOT && \
curl -fSL "$HBASE_URL" -o /tmp/hbase.tar.gz && \
apache_download.sh "hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz" /tmp/hbase.tar.gz && \
echo "$HBASE_SHA512 /tmp/hbase.tar.gz" | sha512sum -c && \
tar xvf /tmp/hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" --exclude="lib/*-tests.jar" --exclude="lib/shaded-clients" && \
rm /tmp/hbase.tar.gz

RUN mkdir -p $TEZ_ROOT && \
curl -fSL "$TEZ_URL" -o /tmp/tez.tar.gz && \
apache_download.sh "tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz" /tmp/tez.tar.gz && \
echo "$TEZ_SHA512 /tmp/tez.tar.gz" | sha512sum -c && \
tar xvf /tmp/tez.tar.gz -C $TEZ_ROOT --strip-components 1 && \
rm /tmp/tez.tar.gz
Expand Down
22 changes: 16 additions & 6 deletions server/gradlew-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,23 @@ if [ ! -e "${GRADLE_WRAPPER_JAR}" ]; then
# The Gradle version extracted from the `distributionUrl` property does not contain ".0" patch
# versions. Need to append a ".0" in that case to download the wrapper jar.
GRADLE_VERSION="$(echo "$GRADLE_DIST_VERSION" | sed 's/^\([0-9]*[.][0-9]*\)$/\1.0/')"
curl --location --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || exit 1
JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)"
EXPECTED="$(cat "${GRADLE_WRAPPER_SHA256}")"
if [ "${JAR_CHECKSUM}" != "${EXPECTED}" ]; then
# If the (just downloaded) checksum and the downloaded wrapper jar do not match, something
# really bad is going on.
MAX_RETRIES=3
for _retry in $(seq 1 ${MAX_RETRIES}); do
curl --location --fail --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || {
echo "Download attempt ${_retry}/${MAX_RETRIES} failed (curl error)" > /dev/stderr
rm -f "${GRADLE_WRAPPER_JAR}"
if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi
exit 1
}
JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)"
if [ "${JAR_CHECKSUM}" = "${EXPECTED}" ]; then
break
fi
echo "SHA256 mismatch on attempt ${_retry}/${MAX_RETRIES} (got ${JAR_CHECKSUM}, expected ${EXPECTED})" > /dev/stderr
rm -f "${GRADLE_WRAPPER_JAR}"
if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi
echo "Expected sha256 of the downloaded gradle-wrapper.jar does not match the downloaded sha256!" > /dev/stderr
exit 1
fi
done
fi
Loading