diff --git a/.github/workflows/behave-cloudberry.yml b/.github/workflows/behave-cloudberry.yml new file mode 100644 index 00000000000..9df8a8e9f23 --- /dev/null +++ b/.github/workflows/behave-cloudberry.yml @@ -0,0 +1,737 @@ +# +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -------------------------------------------------------------------- +# GitHub Actions Workflow: Apache Cloudberry Behave Pipeline +# -------------------------------------------------------------------- +# Description: +# +# This workflow runs Apache Cloudberry gpMgmt Behave tests on Rocky Linux 9. +# It is intentionally separated from the main build/installcheck workflow so +# that Behave-specific matrix expansion, environment setup, result parsing, +# and iterative test stabilization do not disturb the primary CI path. +# +# Workflow Overview: +# 1. **Prepare Behave Matrix**: +# - Expands the selected Behave command-level test matrix. +# - Supports manual filtering through `test_selection`. +# +# 2. **Build Job**: +# - Builds Apache Cloudberry and creates source/RPM artifacts for reuse +# within this workflow. +# +# 3. 
**Behave Job (Matrix)**: +# - Creates a demo cluster for each Behave matrix entry. +# - Runs the selected gpMgmt feature file(s) in isolation. +# - Parses Behave summaries and uploads logs/metadata artifacts. +# +# 4. **Report Job**: +# - Aggregates build and Behave job status into a final workflow summary. +# +# Execution Environment: +# - **Runs On**: ubuntu-22.04 with Rocky Linux 9 containers. +# - **Primary Test Scope**: `gpMgmt/test/behave/mgmt_utils` +# +# Notes: +# - Trigger mode: push, pull_request, and manual `workflow_dispatch`. +# - Behave tests are split by command to reduce cross-feature environment +# pollution. +# - This workflow currently focuses on single-host CI-compatible Behave tests. +# - Logs and parsed summaries are uploaded as artifacts for each matrix entry. +# -------------------------------------------------------------------- + +name: Apache Cloudberry Behave + +on: + push: + branches: [main, REL_2_STABLE] + pull_request: + branches: [main, REL_2_STABLE] + types: [opened, synchronize, reopened, edited] + workflow_dispatch: + inputs: + test_selection: + description: 'Select Behave tests to run (comma-separated). 
Examples: ic-behave-gpconfig,ic-behave-gpstart' + required: false + default: 'all' + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +permissions: + contents: read + packages: read + actions: write + checks: read + pull-requests: read + +env: + LOG_RETENTION_DAYS: 7 + ENABLE_DEBUG: false + +jobs: + prepare-behave-matrix: + runs-on: ubuntu-22.04 + outputs: + behave-matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - id: set-matrix + run: | + echo "=== Behave Matrix Preparation Diagnostics ===" + echo "Event type: ${{ github.event_name }}" + echo "Test selection input: '${{ github.event.inputs.test_selection || 'all' }}'" + + ALL_BEHAVE_TESTS='{ + "include": [ + {"test":"ic-behave-analyzedb","behave_features":["test/behave/mgmt_utils/analyzedb.feature"]}, + {"test":"ic-behave-gp-bash-functions","behave_features":["test/behave/mgmt_utils/gp_bash_functions.feature"]}, + {"test":"ic-behave-gpactivatestandby","behave_features":["test/behave/mgmt_utils/gpactivatestandby.feature"]}, + {"test":"ic-behave-gpaddmirrors", + "behave_features":["test/behave/mgmt_utils/gpaddmirrors.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpcheckcat", + "behave_features":["test/behave/mgmt_utils/gpcheckcat.feature"], + "behave_args":"--tags ~@extended" + }, + {"test":"ic-behave-gpcheckperf", + "behave_features":["test/behave/mgmt_utils/gpcheckperf.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpconfig","behave_features":["test/behave/mgmt_utils/gpconfig.feature"]}, + {"test":"ic-behave-gpinitstandby", + "behave_features":["test/behave/mgmt_utils/gpinitstandby.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpinitsystem", + "behave_features":["test/behave/mgmt_utils/gpinitsystem.feature"], + "behave_args":"--tags ~@extended" + }, + {"test":"ic-behave-gpmovemirrors", + 
"behave_features":["test/behave/mgmt_utils/gpmovemirrors.feature"], + "behave_args":"--tags ~@concourse_cluster --tags ~@extended" + }, + {"test":"ic-behave-gprecoverseg", + "behave_features":["test/behave/mgmt_utils/gprecoverseg.feature"], + "behave_args":"--tags ~@concourse_cluster --tags ~@extended" + }, + {"test":"ic-behave-gpreload","behave_features":["test/behave/mgmt_utils/gpreload.feature"]}, + {"test":"ic-behave-gpstart", + "behave_features":["test/behave/mgmt_utils/gpstart.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpstate", + "behave_features":["test/behave/mgmt_utils/gpstate.feature"], + "behave_args":"--tags ~@concourse_cluster" + }, + {"test":"ic-behave-gpstop","behave_features":["test/behave/mgmt_utils/gpstop.feature"]}, + {"test":"ic-behave-gpssh", + "behave_features":["test/behave/mgmt_utils/gpssh.feature"], + "behave_args":"--tags ~@requires_netem" + }, + {"test":"ic-behave-minirepro","behave_features":["test/behave/mgmt_utils/minirepro.feature"]}, + {"test":"ic-behave-replication-slots", + "behave_features":["test/behave/mgmt_utils/replication_slots.feature"], + "behave_args":"--tags ~@extended" + } + ] + }' + + VALID_TESTS=$(echo "$ALL_BEHAVE_TESTS" | jq -r '.include[].test') + IFS=',' read -ra SELECTED_TESTS <<< "${{ github.event.inputs.test_selection || 'all' }}" + + if [[ "${SELECTED_TESTS[*]}" == "all" || -z "${SELECTED_TESTS[*]}" ]]; then + mapfile -t SELECTED_TESTS <<< "$VALID_TESTS" + fi + + INVALID_TESTS=() + FILTERED_TESTS=() + for TEST in "${SELECTED_TESTS[@]}"; do + TEST=$(echo "$TEST" | tr -d '[:space:]') + if echo "$VALID_TESTS" | grep -qw "$TEST"; then + FILTERED_TESTS+=("$TEST") + else + INVALID_TESTS+=("$TEST") + fi + done + + if [[ ${#INVALID_TESTS[@]} -gt 0 ]]; then + echo "::error::Invalid Behave test(s) selected: ${INVALID_TESTS[*]}" + echo "Valid tests are: $(echo "$VALID_TESTS" | tr '\n' ', ')" + exit 1 + fi + + RESULT='{"include":[' + FIRST=true + for TEST in "${FILTERED_TESTS[@]}"; 
do +            CONFIG=$(jq -c --arg test "$TEST" '.include[] | select(.test == $test)' <<< "$ALL_BEHAVE_TESTS") +            if [[ "$FIRST" == true ]]; then +              FIRST=false +            else +              RESULT="${RESULT}," +            fi +            RESULT="${RESULT}${CONFIG}" +          done +          RESULT="${RESULT}]}" + +          echo "Final behave matrix configuration:" +          echo "$RESULT" | jq . + +          { +            echo "matrix<<EOF" +            echo "$RESULT" +            echo "EOF" +          } >> "$GITHUB_OUTPUT" + +  build: +    name: Build Apache Cloudberry RPM +    env: +      JOB_TYPE: build +    runs-on: ubuntu-22.04 +    timeout-minutes: 120 +    outputs: +      build_timestamp: ${{ steps.set_timestamp.outputs.timestamp }} +    container: +      image: apache/incubator-cloudberry:cbdb-build-rocky9-latest +      options: >- +        --user root +        -h cdw +        -v /usr/share:/host_usr_share +        -v /usr/local:/host_usr_local +        -v /opt:/host_opt +    steps: +      - name: Free Disk Space +        run: | +          echo "=== Disk space before cleanup ===" +          df -h / + +          rm -rf /host_opt/hostedtoolcache || true +          rm -rf /host_usr_local/lib/android || true +          rm -rf /host_usr_share/dotnet || true +          rm -rf /host_opt/ghc || true +          rm -rf /host_usr_local/.ghcup || true +          rm -rf /host_usr_share/swift || true +          rm -rf /host_usr_local/share/powershell || true +          rm -rf /host_usr_local/share/chromium || true +          rm -rf /host_usr_share/miniconda || true +          rm -rf /host_opt/az || true +          rm -rf /host_usr_share/sbt || true + +          echo "=== Disk space after cleanup ===" +          df -h / + +      - name: Set build timestamp +        id: set_timestamp +        run: | +          timestamp=$(date +'%Y%m%d_%H%M%S') +          echo "timestamp=$timestamp" | tee -a "$GITHUB_OUTPUT" +          echo "BUILD_TIMESTAMP=$timestamp" | tee -a "$GITHUB_ENV" + +      - name: Checkout Apache Cloudberry +        uses: actions/checkout@v4 +        with: +          fetch-depth: 1 +          submodules: true + +      - name: Cloudberry Environment Initialization +        env: +          LOGS_DIR: build-logs +        run: | +          set -eo pipefail +          if ! su - gpadmin -c "/tmp/init_system.sh"; then +            echo "::error::Container initialization failed" +            exit 1 +          fi + +          mkdir -p "${LOGS_DIR}/details" +          chown -R gpadmin:gpadmin . +          chmod -R 755 . 
+ chmod 777 "${LOGS_DIR}" + + df -kh / + rm -rf /__t/* + df -kh / + + df -h | tee -a "${LOGS_DIR}/details/disk-usage.log" + free -h | tee -a "${LOGS_DIR}/details/memory-usage.log" + + { + echo "=== Environment Information ===" + uname -a + df -h + free -h + env + } | tee -a "${LOGS_DIR}/details/environment.log" + + echo "SRC_DIR=${GITHUB_WORKSPACE}" | tee -a "$GITHUB_ENV" + + - name: Run Apache Cloudberry configure script + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh"; then + echo "::error::Configure script failed" + exit 1 + fi + + - name: Run Apache Cloudberry build script + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/build-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/build-cloudberry.sh"; then + echo "::error::Build script failed" + exit 1 + fi + + - name: Create Source tarball, create RPM and verify artifacts + env: + CBDB_VERSION: 99.0.0 + BUILD_NUMBER: 1 + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + tar czf "${SRC_DIR}"/../apache-cloudberry-incubating-src.tgz -C "${SRC_DIR}"/.. 
./cloudberry + mv "${SRC_DIR}"/../apache-cloudberry-incubating-src.tgz "${SRC_DIR}" + + rpmdev-setuptree + ln -s "${SRC_DIR}"/devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec "${HOME}"/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec + cp "${SRC_DIR}"/LICENSE /usr/local/cloudberry-db + + DEBUG_RPMBUILD_OPT="" + DEBUG_IDENTIFIER="" + if [ "${{ env.ENABLE_DEBUG }}" = "true" ]; then + DEBUG_RPMBUILD_OPT="--with-debug" + DEBUG_IDENTIFIER=".debug" + fi + + "${SRC_DIR}"/devops/build/packaging/rpm/build-rpm.sh --version "${CBDB_VERSION}" --release "${BUILD_NUMBER}" "${DEBUG_RPMBUILD_OPT}" + + os_version=$(grep -oP '(?<=^VERSION_ID=")[0-9]' /etc/os-release) + RPM_FILE="${HOME}"/rpmbuild/RPMS/x86_64/apache-cloudberry-db-incubating-"${CBDB_VERSION}"-"${BUILD_NUMBER}""${DEBUG_IDENTIFIER}".el"${os_version}".x86_64.rpm + cp "${RPM_FILE}" "${SRC_DIR}" + RPM_DEBUG="${HOME}"/rpmbuild/RPMS/x86_64/apache-cloudberry-db-incubating-debuginfo-"${CBDB_VERSION}"-"${BUILD_NUMBER}""${DEBUG_IDENTIFIER}".el"${os_version}".x86_64.rpm + cp "${RPM_DEBUG}" "${SRC_DIR}" + + - name: Upload build logs + uses: actions/upload-artifact@v4 + with: + name: behave-build-logs-${{ env.BUILD_TIMESTAMP }} + path: | + build-logs/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload Cloudberry RPM build artifacts + uses: actions/upload-artifact@v4 + with: + name: apache-cloudberry-db-incubating-rpm-build-artifacts + retention-days: ${{ env.LOG_RETENTION_DAYS }} + if-no-files-found: error + path: | + *.rpm + + - name: Upload Cloudberry source build artifacts + uses: actions/upload-artifact@v4 + with: + name: apache-cloudberry-db-incubating-source-build-artifacts + retention-days: ${{ env.LOG_RETENTION_DAYS }} + if-no-files-found: error + path: | + apache-cloudberry-incubating-src.tgz + + behave: + name: ${{ matrix.test }} + needs: [build, prepare-behave-matrix] + if: | + !cancelled() && + needs.build.result == 'success' + runs-on: ubuntu-22.04 + timeout-minutes: 120 + strategy: + 
fail-fast: false + matrix: ${{ fromJson(needs.prepare-behave-matrix.outputs.behave-matrix) }} + container: + image: apache/incubator-cloudberry:cbdb-build-rocky9-latest + options: >- + --privileged + --user root + --hostname cdw + --shm-size=2gb + --ulimit core=-1 + --cgroupns=host + -v /sys/fs/cgroup:/sys/fs/cgroup:rw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt + steps: + - name: Free Disk Space + run: | + echo "=== Disk space before cleanup ===" + df -h / + + rm -rf /host_opt/hostedtoolcache || true + rm -rf /host_usr_local/lib/android || true + rm -rf /host_usr_share/dotnet || true + rm -rf /host_opt/ghc || true + rm -rf /host_usr_local/.ghcup || true + rm -rf /host_usr_share/swift || true + rm -rf /host_usr_local/share/powershell || true + rm -rf /host_usr_local/share/chromium || true + rm -rf /host_usr_share/miniconda || true + rm -rf /host_opt/az || true + rm -rf /host_usr_share/sbt || true + + echo "=== Disk space after cleanup ===" + df -h / + + - name: Cloudberry Environment Initialization + env: + LOGS_DIR: build-logs + run: | + set -eo pipefail + if ! su - gpadmin -c "/tmp/init_system.sh"; then + echo "::error::Container initialization failed" + exit 1 + fi + + mkdir -p "${LOGS_DIR}/details" + chown -R gpadmin:gpadmin . + chmod -R 755 . 
+ chmod 777 "${LOGS_DIR}" + + df -kh / + rm -rf /__t/* + df -kh / + + df -h | tee -a "${LOGS_DIR}/details/disk-usage.log" + free -h | tee -a "${LOGS_DIR}/details/memory-usage.log" + + { + echo "=== Environment Information ===" + uname -a + df -h + free -h + env + } | tee -a "${LOGS_DIR}/details/environment.log" + + echo "SRC_DIR=${GITHUB_WORKSPACE}" | tee -a "$GITHUB_ENV" + + - name: Generate Behave Job Summary Start + if: always() + run: | + { + echo "# Behave Job Summary: ${{ matrix.test }}" + echo "## Environment" + echo "- Start Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "- OS Version: $(cat /etc/redhat-release)" + } >> "$GITHUB_STEP_SUMMARY" + + - name: Download Cloudberry RPM build artifacts + uses: actions/download-artifact@v4 + with: + name: apache-cloudberry-db-incubating-rpm-build-artifacts + path: ${{ github.workspace }}/rpm_build_artifacts + merge-multiple: false + run-id: ${{ github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download Cloudberry Source build artifacts + uses: actions/download-artifact@v4 + with: + name: apache-cloudberry-db-incubating-source-build-artifacts + path: ${{ github.workspace }}/source_build_artifacts + merge-multiple: false + run-id: ${{ github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Verify downloaded artifacts + id: verify-artifacts + run: | + set -eo pipefail + + SRC_TARBALL_FILE=$(ls "${GITHUB_WORKSPACE}"/source_build_artifacts/apache-cloudberry-incubating-src.tgz) + if [ ! -f "${SRC_TARBALL_FILE}" ]; then + echo "::error::SRC TARBALL file not found" + exit 1 + fi + echo "src_tarball_file=${SRC_TARBALL_FILE}" >> "$GITHUB_OUTPUT" + + RPM_FILE=$(ls "${GITHUB_WORKSPACE}"/rpm_build_artifacts/apache-cloudberry-db-incubating-[0-9]*.rpm | grep -v "debuginfo") + if [ ! 
-f "${RPM_FILE}" ]; then + echo "::error::RPM file not found" + exit 1 + fi + echo "rpm_file=${RPM_FILE}" >> "$GITHUB_OUTPUT" + + - name: Install Cloudberry RPM + if: success() + env: + RPM_FILE: ${{ steps.verify-artifacts.outputs.rpm_file }} + run: | + set -eo pipefail + + dnf clean all + dnf makecache --refresh || dnf makecache + rm -rf /usr/local/cloudberry-db + + if ! time dnf install -y --setopt=retries=10 --releasever=9 "${RPM_FILE}"; then + echo "::error::RPM installation failed" + exit 1 + fi + + rm -rf "${GITHUB_WORKSPACE}"/rpm_build_artifacts + + - name: Extract source tarball + if: success() + env: + SRC_TARBALL_FILE: ${{ steps.verify-artifacts.outputs.src_tarball_file }} + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + if ! time tar zxf "${SRC_TARBALL_FILE}" -C "${SRC_DIR}"/.. ; then + echo "::error::Source extraction failed" + exit 1 + fi + + rm -rf "${GITHUB_WORKSPACE}"/source_build_artifacts + + - name: Create Apache Cloudberry demo cluster + if: success() + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh + + if ! 
time su - gpadmin -c "cd ${SRC_DIR} && NUM_PRIMARY_MIRROR_PAIRS='3' SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh"; then + echo "::error::Demo cluster creation failed" + exit 1 + fi + + - name: Run Behave Tests + if: success() + env: + SRC_DIR: ${{ github.workspace }} + shell: bash {0} + run: | + set -o pipefail + + mkdir -p build-logs/details + config_log="build-logs/details/make-${{ matrix.test }}-config0.log" + behave_targets="${{ join(matrix.behave_features, ' ') }}" + behave_args="${{ matrix.behave_args || '' }}" + + mkdir -p "/tmp/cloudberry-cores" + chmod 1777 "/tmp/cloudberry-cores" + sysctl -w kernel.core_pattern="/tmp/cloudberry-cores/core-%e-%s-%u-%g-%p-%t" + + dnf install -y libffi-devel || echo "Warning: failed to install libffi-devel" + su - gpadmin -c "pip3 install --user -r ${SRC_DIR}/gpMgmt/requirements-dev.txt || pip install --user -r ${SRC_DIR}/gpMgmt/requirements-dev.txt" + + echo "Running features:" + for feature in $behave_targets; do + echo "- $feature" + done + if [[ -n "$behave_args" ]]; then + echo "Behave args: $behave_args" + fi + + if ! time su - gpadmin -c "cd ${SRC_DIR}/gpMgmt && source /usr/local/cloudberry-db/cloudberry-env.sh && source ${SRC_DIR}/gpAux/gpdemo/gpdemo-env.sh && PYTHONPATH=${SRC_DIR}/gpMgmt:\$PYTHONPATH behave $behave_args $behave_targets" \ + 2>&1 | tee -a "$config_log"; then + echo "::warning::Behave execution reported failures" + exit 1 + fi + + - name: Parse Behave Results + if: always() + shell: bash {0} + run: | + set -o pipefail + + config_log="build-logs/details/make-${{ matrix.test }}-config0.log" + behave_cmd="behave ${{ matrix.behave_args || '' }} ${{ join(matrix.behave_features, ' ') }}" + if [ ! 
-f "$config_log" ]; then + { + echo "MAKE_COMMAND=\"${behave_cmd}\"" + echo "STATUS=missing_log" + echo "TOTAL_TESTS=0" + echo "FAILED_TESTS=0" + echo "PASSED_TESTS=0" + echo "IGNORED_TESTS=0" + } | tee "test_results.${{ matrix.test }}.0.txt" + exit 1 + fi + + features_line=$(grep -E '^[0-9]+ feature(s)? passed, [0-9]+ failed, [0-9]+ skipped$' "$config_log" | tail -n 1) + scenarios_line=$(grep -E '^[0-9]+ scenario(s)? passed, [0-9]+ failed, [0-9]+ skipped(, [0-9]+ untested)?$' "$config_log" | tail -n 1) + steps_line=$(grep -E '^[0-9]+ step(s)? passed, [0-9]+ failed, [0-9]+ skipped, [0-9]+ undefined(, [0-9]+ untested)?$' "$config_log" | tail -n 1) + + if [[ -z "$scenarios_line" ]]; then + { + echo "MAKE_COMMAND=\"${behave_cmd}\"" + echo "STATUS=parse_error" + echo "TOTAL_TESTS=0" + echo "FAILED_TESTS=0" + echo "PASSED_TESTS=0" + echo "IGNORED_TESTS=0" + } | tee "test_results.${{ matrix.test }}.0.txt" + exit 1 + fi + + scenario_counts=$(echo "$scenarios_line" | sed -E 's/^([0-9]+) scenario(s)? 
passed, ([0-9]+) failed, ([0-9]+) skipped(, ([0-9]+) untested)?$/\1 \3 \4 \6/') + read -r scenarios_passed scenarios_failed scenarios_skipped scenarios_untested <<< "$scenario_counts" + scenarios_untested=${scenarios_untested:-0} + total_scenarios=$((scenarios_passed + scenarios_failed + scenarios_skipped)) + + { + echo "MAKE_COMMAND=\"${behave_cmd}\"" + if [[ "$scenarios_failed" -eq 0 ]]; then + echo "STATUS=passed" + else + echo "STATUS=failed" + fi + echo "TOTAL_TESTS=${total_scenarios}" + echo "FAILED_TESTS=${scenarios_failed}" + echo "PASSED_TESTS=${scenarios_passed}" + echo "IGNORED_TESTS=${scenarios_skipped}" + echo "BEHAVE_UNTESTED_SCENARIOS=${scenarios_untested}" + echo "BEHAVE_FEATURES_SUMMARY=\"${features_line:-unavailable}\"" + echo "BEHAVE_SCENARIOS_SUMMARY=\"${scenarios_line}\"" + echo "BEHAVE_STEPS_SUMMARY=\"${steps_line:-unavailable}\"" + } | tee "test_results.${{ matrix.test }}.0.txt" + + if [[ "$scenarios_failed" -eq 0 ]]; then + exit 0 + fi + exit 1 + + - name: Generate Behave Job Summary End + if: always() + shell: bash {0} + run: | + { + echo "## Test Results" + echo "- End Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + + if [[ ! -f "test_results.${{ matrix.test }}.0.txt" ]]; then + echo "### Result Status" + echo "⚠️ No results file found" + exit 0 + fi + + . 
"test_results.${{ matrix.test }}.0.txt" + + echo "### Command" + echo "\`$MAKE_COMMAND\`" + echo "" + + echo "### Status" + case "${STATUS:-unknown}" in + passed) + echo "✅ All scenarios passed" + ;; + failed) + echo "❌ Some scenarios failed" + ;; + parse_error) + echo "⚠️ Could not parse Behave results" + ;; + missing_log) + echo "⚠️ Behave log file missing" + ;; + *) + echo "⚠️ Unknown status: ${STATUS:-unknown}" + ;; + esac + + echo "" + echo "### Scenario Counts" + echo "| Metric | Count |" + echo "|--------|-------|" + echo "| Total Scenarios | ${TOTAL_TESTS:-0} |" + echo "| Passed Scenarios | ${PASSED_TESTS:-0} |" + echo "| Failed Scenarios | ${FAILED_TESTS:-0} |" + echo "| Skipped Scenarios | ${IGNORED_TESTS:-0} |" + echo "| Untested Scenarios | ${BEHAVE_UNTESTED_SCENARIOS:-0} |" + + echo "" + echo "### Behave Summary" + echo "| Metric | Summary |" + echo "|--------|---------|" + echo "| Features | ${BEHAVE_FEATURES_SUMMARY:-unavailable} |" + echo "| Scenarios | ${BEHAVE_SCENARIOS_SUMMARY:-unavailable} |" + echo "| Steps | ${BEHAVE_STEPS_SUMMARY:-unavailable} |" + } >> "$GITHUB_STEP_SUMMARY" || true + + - name: Upload behave logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: behave-logs-${{ matrix.test }}-${{ needs.build.outputs.build_timestamp || github.run_id }} + path: | + build-logs/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload Behave Metadata + if: always() + uses: actions/upload-artifact@v4 + with: + name: behave-metadata-${{ matrix.test }} + path: | + test_results*.txt + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + report: + name: Generate Apache Cloudberry Behave Report + needs: [build, prepare-behave-matrix, behave] + if: always() + runs-on: ubuntu-22.04 + steps: + - name: Generate Final Report + run: | + { + echo "# Apache Cloudberry Behave Report" + echo "## Job Status" + echo "- Build Job: ${{ needs.build.result }}" + echo "- Behave Job: ${{ needs.behave.result }}" + echo "- Completion Time: $(date 
-u +'%Y-%m-%d %H:%M:%S UTC')" + + if [[ "${{ needs.build.result }}" == "success" && + "${{ needs.behave.result }}" =~ ^(success|skipped)$ ]]; then + echo "✅ Pipeline completed successfully" + else + echo "⚠️ Pipeline completed with failures" + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Notify on failure + if: | + needs.build.result != 'success' || + !contains(fromJson('["success","skipped"]'), needs.behave.result) + run: | + echo "::error::Behave pipeline failed! Check job summaries and logs for details" + echo "Timestamp: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "Build Result: ${{ needs.build.result }}" + echo "Behave Result: ${{ needs.behave.result }}" diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml index adb57fb85ec..ef66b60d913 100644 --- a/.github/workflows/build-cloudberry.yml +++ b/.github/workflows/build-cloudberry.yml @@ -223,7 +223,6 @@ jobs: DEFAULT_ENABLE_CGROUPS=false DEFAULT_ENABLE_CORE_CHECK=true DEFAULT_PG_SETTINGS_OPTIMIZER="" - # Define base test configurations ALL_TESTS='{ "include": [ @@ -1573,8 +1572,6 @@ jobs: continue fi - # Parse this configuration's results - MAKE_NAME="${{ matrix.test }}-config$i" \ "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/parse-test-results.sh "$config_log" status_code=$? diff --git a/gpAux/gpdemo/demo_cluster.sh b/gpAux/gpdemo/demo_cluster.sh index 225bb76a5ee..7397894d359 100755 --- a/gpAux/gpdemo/demo_cluster.sh +++ b/gpAux/gpdemo/demo_cluster.sh @@ -314,8 +314,10 @@ cat >> $CLUSTER_CONFIG <<-EOF COORDINATOR_PORT=${COORDINATOR_DEMO_PORT} - # Shell to use to execute commands on all hosts - TRUSTED_SHELL="$(dirname "$0")/lalshell" + # Shell to use to execute commands on all hosts. Use an absolute path here + # because this file is later sourced by gpinitsystem, where \$0 is no longer + # demo_cluster.sh. 
+ TRUSTED_SHELL=$(pwd)/lalshell ENCODING=UNICODE EOF diff --git a/gpMgmt/bin/analyzedb b/gpMgmt/bin/analyzedb index 48d8e16872c..082d6af8377 100755 --- a/gpMgmt/bin/analyzedb +++ b/gpMgmt/bin/analyzedb @@ -951,7 +951,10 @@ class AnalyzeDb(Operation): # Create a Command object that executes a query using psql. def create_psql_command(dbname, query): psql_cmd = """psql %s -c %s""" % (pipes.quote(dbname), pipes.quote(query)) - return Command(query, psql_cmd) + # Keep the command text intact for execution, but make the display name + # ASCII-safe so logger/output paths do not choke on UTF-8 identifiers. + safe_query_display = query.encode('ascii', 'backslashreplace').decode('ascii') + return Command(safe_query_display, psql_cmd) def run_sql(conn, query): diff --git a/gpMgmt/bin/gppylib/commands/base.py b/gpMgmt/bin/gppylib/commands/base.py index d455c6e2d13..477c0ba1a75 100755 --- a/gpMgmt/bin/gppylib/commands/base.py +++ b/gpMgmt/bin/gppylib/commands/base.py @@ -37,6 +37,12 @@ CMD_CACHE = {} + +def _safe_log_string(value): + if isinstance(value, str): + return value.encode('ascii', 'backslashreplace').decode('ascii') + return str(value) + # Maximum retries if sshd rejects the connection due to too many # unauthenticated connections. 
SSH_MAX_RETRY = 10 @@ -86,7 +92,7 @@ def markTaskDone(self): self.work_queue.task_done() def addCommand(self, cmd): - self.logger.debug("Adding cmd to work_queue: %s" % cmd.cmdStr) + self.logger.debug("Adding cmd to work_queue: %s" % _safe_log_string(cmd.cmdStr)) self.work_queue.put(cmd) self._assigned += 1 @@ -272,20 +278,20 @@ def run(self): self.cmd = None return elif self.pool.should_stop: - self.logger.debug("[%s] got cmd and pool is stopped: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] got cmd and pool is stopped: %s" % (self.name, _safe_log_string(self.cmd))) self.pool.markTaskDone() self.cmd = None else: - self.logger.debug("[%s] got cmd: %s" % (self.name, self.cmd.cmdStr)) + self.logger.debug("[%s] got cmd: %s" % (self.name, _safe_log_string(self.cmd.cmdStr))) self.cmd.run() - self.logger.debug("[%s] finished cmd: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] finished cmd: %s" % (self.name, _safe_log_string(self.cmd))) self.pool.addFinishedWorkItem(self.cmd) self.cmd = None except Exception as e: self.logger.exception(e) if self.cmd: - self.logger.debug("[%s] finished cmd with exception: %s" % (self.name, self.cmd)) + self.logger.debug("[%s] finished cmd with exception: %s" % (self.name, _safe_log_string(self.cmd))) self.pool.addFinishedWorkItem(self.cmd) self.cmd = None @@ -548,9 +554,9 @@ def __init__(self, name, cmdStr, ctxt=LOCAL, remoteHost=None, stdin=None, gphome def __str__(self): if self.results: - return "%s cmdStr='%s' had result: %s" % (self.name, self.cmdStr, self.results) + return "%s cmdStr='%s' had result: %s" % (self.name, _safe_log_string(self.cmdStr), self.results) else: - return "%s cmdStr='%s'" % (self.name, self.cmdStr) + return "%s cmdStr='%s'" % (self.name, _safe_log_string(self.cmdStr)) # Start a process that will execute the command but don't wait for # it to complete. Return the Popen object instead. 
@@ -559,7 +565,7 @@ def runNoWait(self): return self.exec_context.proc def run(self, validateAfter=False): - self.logger.debug("Running Command: %s" % self.cmdStr) + self.logger.debug("Running Command: %s" % _safe_log_string(self.cmdStr)) self.exec_context.execute(self, pickled=self.pickled, start_new_session=self.start_new_session) if validateAfter: diff --git a/gpMgmt/bin/gppylib/test/unit/test_unit_analyzedb.py b/gpMgmt/bin/gppylib/test/unit/test_unit_analyzedb.py new file mode 100644 index 00000000000..be3b33efd66 --- /dev/null +++ b/gpMgmt/bin/gppylib/test/unit/test_unit_analyzedb.py @@ -0,0 +1,22 @@ +import imp +import os + +from gppylib.test.unit.gp_unittest import GpTestCase, run_tests + + +class AnalyzeDbTestCase(GpTestCase): + def setUp(self): + analyzedb_file = os.path.abspath(os.path.dirname(__file__) + "/../../../analyzedb") + self.subject = imp.load_source('analyzedb', analyzedb_file) + + def test_create_psql_command_keeps_utf8_sql_but_uses_ascii_safe_display_name(self): + query = 'analyze "public"."spiegelungssätze"' + + cmd = self.subject.create_psql_command('special_encoding_db', query) + + self.assertEqual(cmd.name, 'analyze "public"."spiegelungss\\xe4tze"') + self.assertIn('spiegelungssätze', cmd.cmdStr) + + +if __name__ == '__main__': + run_tests() diff --git a/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature b/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature index d9b91838909..1fe3bb05e16 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpcheckcat.feature @@ -311,14 +311,14 @@ Feature: gpcheckcat tests Then gpcheckcat should return a return code of 3 And the user runs "dropdb fkey_ta" + @extended Scenario: gpcheckcat should report and repair extra entries with non-oid primary keys Given database "extra_pk_db" is dropped and recreated And the path "gpcheckcat.repair.*" is removed from current working directory And the user runs "psql extra_pk_db -c 'CREATE SCHEMA my_pk_schema' " And the user runs 
"psql extra_pk_db -f test/behave/mgmt_utils/steps/data/gpcheckcat/add_operator.sql " Then psql should return a return code of 0 - And the user runs "psql extra_pk_db -c "set allow_system_table_mods=true;DELETE FROM pg_catalog.pg_operator where oprname='!#'" " - Then psql should return a return code of 0 + Then The user runs sql "set allow_system_table_mods=true;DELETE FROM pg_catalog.pg_operator where oprname='!#'" in "extra_pk_db" on first primary segment When the user runs "gpcheckcat -R missing_extraneous extra_pk_db" Then gpcheckcat should return a return code of 3 And the path "gpcheckcat.repair.*" is found in cwd "0" times @@ -728,18 +728,13 @@ Feature: gpcheckcat tests And the user runs "dropdb all_good" - Scenario: validate session GUC passed with -x is set + Scenario: gpcheckcat accepts session GUC passed with -x in single node mode Given the database is not running And the user runs "gpstart -ma" And "gpstart -ma" should return a return code of 0 - Then the user runs "gpcheckcat -R foreign_key" - Then gpcheckcat should return a return code of 1 - And gpcheckcat should print ".* System was started in single node mode - only utility mode connections are allowed" to stdout Then the user runs "gpcheckcat -x gp_role=utility -R foreign_key" Then gpcheckcat should return a return code of 0 And the user runs "gpstop -ma" And "gpstop -m" should return a return code of 0 And the user runs "gpstart -a" - - diff --git a/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature b/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature index 1d69a5403ff..1c9051764bb 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpinitsystem.feature @@ -70,6 +70,7 @@ Feature: gpinitsystem tests Given the user runs "gpstate" Then gpstate should return a return code of 0 + @extended Scenario: gpinitsystem creates a backout file when gpinitsystem process terminated Given create demo cluster config And all files in gpAdminLogs directory are deleted 
@@ -84,6 +85,7 @@ Feature: gpinitsystem tests And gpinitsystem should return a return code of 0 And gpintsystem logs should not contain lines about running backout script + @extended Scenario: gpinitsystem creates a backout file when gpcreateseg process terminated Given create demo cluster config And all files in gpAdminLogs directory are deleted @@ -97,6 +99,7 @@ Feature: gpinitsystem tests And gpinitsystem should return a return code of 0 And gpintsystem logs should not contain lines about running backout script + @extended Scenario: gpinitsystem does not create or need backout file when user terminated very early Given create demo cluster config And all files in gpAdminLogs directory are deleted @@ -333,4 +336,3 @@ Feature: gpinitsystem tests When the user runs command "grep -q '.*gpcreateseg\.sh.*Completed ssh.*' ~/gpAdminLogs/gpinitsystem*log" Then grep should return a return code of 0 And the user runs command "mv ../gpAux/gpdemo/clusterConfigFile.bak ../gpAux/gpdemo/clusterConfigFile" - diff --git a/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature b/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature index ed6dda775f4..4e4fac9d4e8 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpmovemirrors.feature @@ -103,6 +103,7 @@ Feature: Tests for gpmovemirrors assert that gp_seg_config wasn't updated """ + @extended Scenario Outline: user can if mirrors failed to move initially Given the database is running And all the segments are running @@ -146,6 +147,7 @@ Feature: Tests for gpmovemirrors | run gprecoverseg | some | 0 | 1,2 | running in place full recovery for all failed contents | | run gprecoverseg | all | None | 0,1,2 | running in place full recovery for all failed contents | + @extended @skip_cleanup Scenario: gpmovemirrors can move mirrors even if start fails for some mirrors Given the database is running @@ -181,6 +183,7 @@ Feature: Tests for gpmovemirrors And user can start transactions + @extended 
@demo_cluster Scenario: gpmovemirrors -i creates recovery_progress.file if some mirrors are moved Given the database is running @@ -204,6 +207,7 @@ Feature: Tests for gpmovemirrors And user can start transactions And all files in gpAdminLogs directory are deleted on all hosts in the cluster + @extended @demo_cluster Scenario: gpmovemirrors -i creates recovery_progress.file if all mirrors are moved Given the database is running diff --git a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature index 3d28dfc11d5..91353a965e3 100644 --- a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature +++ b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature @@ -1,7 +1,7 @@ @gprecoverseg Feature: gprecoverseg tests - Scenario: incremental recovery works with tablespaces + Scenario Outline: incremental recovery works with tablespaces Given the database is running And a tablespace is created with data And user stops all primary processes @@ -117,6 +117,7 @@ Feature: gprecoverseg tests And all the segments are running And the segments are synchronized + @extended Scenario: gprecoverseg full recovery displays pg_controldata success info Given the database is running And all the segments are running @@ -129,6 +130,7 @@ Feature: gprecoverseg tests And the segments are synchronized And check segment conf: postgresql.conf + @extended Scenario: gprecoverseg incremental recovery displays pg_controldata success info Given the database is running And all the segments are running @@ -141,6 +143,7 @@ Feature: gprecoverseg tests And the segments are synchronized And check segment conf: postgresql.conf + @extended Scenario: gprecoverseg full recovery displays pg_controldata success info Given the database is running And all the segments are running @@ -153,6 +156,7 @@ Feature: gprecoverseg tests And the segments are synchronized And check segment conf: postgresql.conf + @extended Scenario: gprecoverseg incremental recovery displays pg_controldata 
success info Given the database is running And all the segments are running @@ -256,6 +260,7 @@ Feature: gprecoverseg tests And all the segments are running And the segments are synchronized + @extended Scenario: gprecoverseg differential recovery displays rsync progress to the user Given the database is running And all the segments are running @@ -278,6 +283,7 @@ Feature: gprecoverseg tests And verify replication slot internal_wal_replication_slot is available on all the segments And check segment conf: postgresql.conf + @extended Scenario: gprecoverseg does not display rsync progress to the user when --no-progress option is specified Given the database is running And all the segments are running @@ -299,6 +305,7 @@ Feature: gprecoverseg tests And the segments are synchronized And check segment conf: postgresql.conf + @extended Scenario: When gprecoverseg incremental recovery uses pg_rewind to recover and an existing postmaster.pid on the killed primary segment corresponds to a non postgres process Given the database is running And all the segments are running @@ -323,6 +330,7 @@ Feature: gprecoverseg tests And the backup pid file is deleted on "primary" segment And the background pid is killed on "primary" segment + @extended Scenario: Pid does not correspond to any running process Given the database is running And all the segments are running @@ -345,6 +353,7 @@ Feature: gprecoverseg tests And the segments are synchronized And the backup pid file is deleted on "primary" segment + @extended Scenario: pg_isready functions on recovered segments Given the database is running And all the segments are running @@ -362,6 +371,7 @@ Feature: gprecoverseg tests And the segments are synchronized And pg_isready reports all primaries are accepting connections + @extended Scenario: gprecoverseg incremental recovery displays status for mirrors after pg_rewind call Given the database is running And all the segments are running @@ -376,6 +386,7 @@ Feature: gprecoverseg tests And 
the segments are synchronized And the cluster is rebalanced + @extended @backup_restore_bashrc Scenario: gprecoverseg should not return error when banner configured on host Given the database is running @@ -718,6 +729,7 @@ Feature: gprecoverseg tests And the cluster is recovered in full and rebalanced And the row count from table "test_recoverseg" in "postgres" is verified against the saved data + @extended @demo_cluster Scenario Outline: gprecoverseg differential recovery segments come up even if recovery for one segment fails Given the database is running @@ -1813,6 +1825,7 @@ Feature: gprecoverseg tests And gprecoverseg should return a return code of 0 And the cluster is rebalanced + @extended @demo_cluster Scenario: gprecoverseg recovers segment when config file contains hostname on demo cluster Given the database is running @@ -1831,6 +1844,7 @@ Feature: gprecoverseg tests And the cluster configuration has no segments where "content=0 and status='d'" Then the cluster is rebalanced + @extended @demo_cluster Scenario: gprecoverseg skips recovery when config file contains invalid hostname on demo cluster Given the database is running diff --git a/gpMgmt/test/behave/mgmt_utils/gpssh.feature b/gpMgmt/test/behave/mgmt_utils/gpssh.feature index 07d7736db56..08aa48b3ef5 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpssh.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpssh.feature @@ -24,10 +24,11 @@ Feature: gpssh behave tests And gpssh should print "unable to login to localhost" to stdout And gpssh should print "could not synchronize with original prompt" to stdout + @requires_netem Scenario: gpssh succeeds when network has latency When the user runs command "sudo tc qdisc add dev lo root netem delay 4000ms" Then sudo should return a return code of 0 When the user runs "gpssh -h localhost echo 'hello I am testing'" Then gpssh should return a return code of 0 And gpssh should print "hello I am testing" to stdout - # We depend on environment.py#after_scenario() to delete 
the artificial latency \ No newline at end of file + # We depend on environment.py#after_scenario() to delete the artificial latency diff --git a/gpMgmt/test/behave/mgmt_utils/gpstate.feature b/gpMgmt/test/behave/mgmt_utils/gpstate.feature index e03c7f7bd8e..49fc475c0e2 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpstate.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpstate.feature @@ -251,7 +251,7 @@ Feature: gpstate tests Scenario: gpstate -m logs mirror details Given a standard local demo cluster is running When the user runs "gpstate -m" - Then gpstate should print "Current GPDB mirror list and status" to stdout + Then gpstate should print "Current CBDB mirror list and status" to stdout And gpstate output looks like | Mirror | Datadir | Port | Status | Data Status | | \S+ | .*/dbfast_mirror1/demoDataDir0 | [0-9]+ | Passive | Synchronized | @@ -263,7 +263,7 @@ Feature: gpstate tests And user stops all primary processes And user can start transactions When the user runs "gpstate -m" - Then gpstate should print "Current GPDB mirror list and status" to stdout + Then gpstate should print "Current CBDB mirror list and status" to stdout And gpstate output looks like | Mirror | Datadir | Port | Status | Data Status | | \S+ | .*/dbfast_mirror1/demoDataDir0 | [0-9]+ | Acting as Primary | Not In Sync | diff --git a/gpMgmt/test/behave/mgmt_utils/minirepro.feature b/gpMgmt/test/behave/mgmt_utils/minirepro.feature index 15e9c666a51..14a7f5fea6a 100644 --- a/gpMgmt/test/behave/mgmt_utils/minirepro.feature +++ b/gpMgmt/test/behave/mgmt_utils/minirepro.feature @@ -28,6 +28,7 @@ Feature: Dump minimum database objects that is related to the query @minirepro_UI Scenario: Database does not exist Given database "nonedb000" does not exist + And the file "/home/gpadmin/test/in.sql" exists and contains "select 1;" When the user runs "minirepro nonedb000 -q ~/test/in.sql -f ~/out.sql" Then minirepro error should contain database "nonedb000" does not exist diff --git 
a/gpMgmt/test/behave/mgmt_utils/replication_slots.feature b/gpMgmt/test/behave/mgmt_utils/replication_slots.feature index 121c7abf783..56825e5ee2f 100644 --- a/gpMgmt/test/behave/mgmt_utils/replication_slots.feature +++ b/gpMgmt/test/behave/mgmt_utils/replication_slots.feature @@ -1,11 +1,16 @@ @replication_slots Feature: Replication Slots - Scenario: Lifecycle of cluster's replication slots + Scenario: Replication slots are created for a new mirrored cluster Given I have a machine with no cluster When I create a cluster Then the primaries and mirrors should be replicating using replication slots +@extended +Scenario: Replication slots remain correct after failover and rebalance + Given I have a machine with no cluster + And I create a cluster + Given a preferred primary has failed When the user runs "gprecoverseg -a" And gprecoverseg should return a return code of 0 @@ -19,18 +24,27 @@ Feature: Replication Slots And the segments are synchronized And the primaries and mirrors should be replicating using replication slots + @extended + Scenario: Replication slots remain correct after full recovery + Given I have a machine with no cluster + And I create a cluster + When a mirror has crashed And the user runs "gprecoverseg -aFv" And gprecoverseg should return a return code of 0 And the segments are synchronized Then the primaries and mirrors should be replicating using replication slots + @extended + Scenario: Replication slots remain correct after expansion + Given I have a machine with no cluster + And I create a cluster + When I add a segment to the cluster And the segments are synchronized Then the primaries and mirrors should be replicating using replication slots - Scenario: A adding mirrors to a cluster after the primaries have been initialized + Scenario: Replication slots are created when mirrors are added later Given I cluster with no mirrors When I add mirrors to the cluster Then the primaries and mirrors should be replicating using replication slots - diff 
--git a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py index 5af0e37762e..126d6155454 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py @@ -23,6 +23,7 @@ from gppylib.gparray import GpArray, ROLE_PRIMARY, ROLE_MIRROR from gppylib.commands.gp import SegmentStart, GpStandbyStart, CoordinatorStop from gppylib.commands import gp +from gppylib.commands import unix from gppylib.commands.pg import PgBaseBackup from gppylib.operations.startSegments import MIRROR_MODE_MIRRORLESS from gppylib.operations.buildMirrorSegments import get_recovery_progress_pattern @@ -489,14 +490,6 @@ def impl(context): else: return -@then( 'verify if the gprecoverseg.lock directory is present in coordinator_data_directory') -def impl(context): - gprecoverseg_lock_file = "%s/gprecoverseg.lock" % gp.get_coordinatordatadir() - if not os.path.exists(gprecoverseg_lock_file): - raise Exception('gprecoverseg.lock directory does not exist') - else: - return - @then('verify that lines from recovery_progress.file are present in segment progress files in {logdir}') def impl(context, logdir): @@ -671,11 +664,6 @@ def impl(context, process_name, signal_name): command = "ps ux | grep bin/{0} | awk '{{print $2}}' | xargs kill -{1}".format(process_name, sig.value) run_async_command(context, command) -@when('the user asynchronously sets up to end {process_name} process with SIGHUP') -def impl(context, process_name): - command = "ps ux | grep bin/%s | awk '{print $2}' | xargs kill -9" % (process_name) - run_async_command(context, command) - @when('the user asynchronously sets up to end gpcreateseg process when it starts') def impl(context): # We keep trying to find the gpcreateseg process using ps,grep @@ -4090,7 +4078,13 @@ def impl(context): for host in host_to_pid_map: for pid in host_to_pid_map[host]: - if unix.check_pid_on_remotehost(pid, host): + # gpstop/gpstart can return before 
every saved pid fully exits. + # Poll briefly to avoid flaking on processes that are already shutting down. + for _ in range(10): + if not unix.check_pid_on_remotehost(pid, host): + break + time.sleep(1) + else: raise Exception("Postgres process {0} not killed on {1}.".format(pid, host)) diff --git a/gpMgmt/test/behave/mgmt_utils/steps/minirepro_mgmt_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/minirepro_mgmt_utils.py index 2a8599f25fc..718b1107909 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/minirepro_mgmt_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/minirepro_mgmt_utils.py @@ -1,6 +1,23 @@ -import os, mmap +import os import re -from test.behave_utils.utils import drop_database_if_exists, drop_table_if_exists +from test.behave_utils.utils import create_database_if_not_exists, drop_database_if_exists, drop_table_if_exists + + +def _find_expected_position(contents, expected): + pos = contents.find(expected) + if pos != -1: + return pos + + ddl_match = re.match(r'CREATE (TABLE|VIEW) (.+)$', expected) + if ddl_match: + object_type, object_name = ddl_match.groups() + quoted_name = re.escape(object_name) + pattern = r'CREATE %s (?:(?:"[^"]+"|[A-Za-z_][A-Za-z0-9_$]*)\.)?"?%s"?' % (object_type, quoted_name) + regex_match = re.search(pattern, contents) + if regex_match: + return regex_match.start() + + return -1 @given('database "{dbname}" does not exist') def impl(context, dbname): @@ -41,9 +58,9 @@ def impl(context, output_file): @then('the output file "{output_file}" should contain "{str_before}" before "{str_after}"') def impl(context, output_file, str_before, str_after): with open(output_file, 'r') as output_f: - s = mmap.mmap(output_f.fileno(), 0, access=mmap.ACCESS_READ) - pos_before = s.find(str_before) - pos_after = s.find(str_after) + s = output_f.read() + pos_before = _find_expected_position(s, str_before) + pos_after = _find_expected_position(s, str_after) if pos_before == -1: raise Exception('%s not found.' 
% str_before) if pos_after == -1: @@ -54,15 +71,15 @@ def impl(context, output_file, str_before, str_after): @then('the output file "{output_file}" should contain "{search_str}"') def impl(context, output_file, search_str): with open(output_file, 'r') as output_f: - s = mmap.mmap(output_f.fileno(), 0, access=mmap.ACCESS_READ) - if s.find(search_str) == -1: + s = output_f.read() + if _find_expected_position(s, search_str) == -1: raise Exception('%s not found.' % search_str) @then('the output file "{output_file}" should not contain "{search_str}"') def impl(context, output_file, search_str): with open(output_file, 'r') as output_f: - s = mmap.mmap(output_f.fileno(), 0, access=mmap.ACCESS_READ) - if s.find(search_str) != -1: + s = output_f.read() + if _find_expected_position(s, search_str) != -1: raise Exception('%s should not exist.' % search_str) @then('the output file "{output_file}" should be loaded to database "{db_name}" without error') diff --git a/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py index 7357b614172..12e3f5abe49 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py @@ -4,15 +4,17 @@ from time import sleep from contextlib import closing +from gppylib import gplog from gppylib.commands.base import Command, ExecutionError, REMOTE, WorkerPool +from gppylib.commands.gp import RECOVERY_REWIND_APPNAME from gppylib.db import dbconn from gppylib.gparray import GpArray, ROLE_PRIMARY, ROLE_MIRROR -from gppylib.programs.clsRecoverSegment_triples import get_segments_with_running_basebackup, is_pg_rewind_running -from gppylib.operations.get_segments_in_recovery import is_seg_in_backup_mode from test.behave_utils.utils import * import platform, shutil from behave import given, when, then +logger = gplog.get_default_logger() + #TODO remove duplication of these functions def _get_gpAdminLogs_directory(): 
return "%s/gpAdminLogs" % os.path.expanduser("~") @@ -23,6 +25,76 @@ def lines_matching_both(in_str, str_1, str_2): return [line for line in lines if line.count(str_1) and line.count(str_2)] +def get_segments_with_running_basebackup(): + """ + Returns a set of content ids whose source segments currently have + a running pg_basebackup. + """ + sql = "select gp_segment_id from gp_stat_replication where application_name = 'pg_basebackup'" + + try: + with closing(dbconn.connect(dbconn.DbURL())) as conn: + rows = dbconn.query(conn, sql).fetchall() + except Exception as e: + raise Exception("Failed to query gp_stat_replication: %s" % str(e)) + + segments_with_running_basebackup = {row[0] for row in rows} + + if len(segments_with_running_basebackup) == 0: + logger.debug("No basebackup running") + + return segments_with_running_basebackup + + +def is_pg_rewind_running(hostname, port): + """ + Returns true if a pg_rewind process is running for the given segment. + """ + sql = "SELECT count(*) FROM pg_stat_activity WHERE application_name = '{}'".format( + RECOVERY_REWIND_APPNAME + ) + + try: + url = dbconn.DbURL(hostname=hostname, port=port, dbname='template1') + with closing(dbconn.connect(url, utility=True)) as conn: + return dbconn.querySingleton(conn, sql) > 0 + except Exception as e: + raise Exception( + "Failed to query pg_stat_activity for segment hostname: {}, port: {}, error: {}".format( + hostname, str(port), str(e) + ) + ) + + +def is_seg_in_backup_mode(hostname, port): + """ + Returns true if the source segment is already in backup mode. + + Differential recovery uses pg_start_backup() on the source segment, so + a source that is already in backup mode indicates differential recovery + may already be in progress. 
+ """ + logger.debug( + "Checking if backup is already in progress for the source server with host {} and port {}".format( + hostname, port + ) + ) + + sql = "SELECT pg_is_in_backup()" + try: + url = dbconn.DbURL(hostname=hostname, port=port, dbname='template1') + with closing(dbconn.connect(url, utility=True)) as conn: + res = dbconn.querySingleton(conn, sql) + except Exception as e: + raise Exception( + "Failed to query pg_is_in_backup() for segment with hostname {}, port {}, error: {}".format( + hostname, str(port), str(e) + ) + ) + + return res + + @given('the information of contents {contents} is saved') @when('the information of contents {contents} is saved') @then('the information of contents {contents} is saved') diff --git a/gpMgmt/test/behave/mgmt_utils/steps/replication_slots_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/replication_slots_utils.py index aa9b4a011c1..a8e24a6d3f0 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/replication_slots_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/replication_slots_utils.py @@ -138,7 +138,6 @@ def step_impl(context): @given('a preferred primary has failed') def step_impl(context): stop_primary(context, 0) - wait_for_unblocked_transactions(context) @when('primary and mirror switch to non-preferred roles')