diff --git a/.github/workflows/build_and_test_maxtext.yml b/.github/workflows/build_and_test_maxtext.yml index f758f428d5..c69202d447 100644 --- a/.github/workflows/build_and_test_maxtext.yml +++ b/.github/workflows/build_and_test_maxtext.yml @@ -108,79 +108,68 @@ jobs: uses: ./.github/workflows/run_jupyter_notebooks.yml strategy: fail-fast: false - matrix: - image_type: ["py312"] with: device_type: tpu device_name: v6e-4 - image_type: ${{ matrix.image_type }} + base_image: maxtext-unit-test-tpu:py312 cloud_runner: linux-x86-ct6e-180-4tpu maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} secrets: HF_TOKEN: ${{ secrets.HF_TOKEN }} - maxtext_cpu_unit_tests: - needs: build_and_upload_maxtext_package + tpu-tests: + needs: [build_and_upload_maxtext_package] if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml + uses: ./.github/workflows/run_tests_coordinator.yml strategy: - fail-fast: false # don't cancel all jobs on failure - matrix: - image_type: ["py312"] - worker_group: [1, 2] + fail-fast: false + matrix: + include: + - flavor: tpu-unit + pip_deps: "" + - flavor: tpu-integration + pip_deps: "" + - flavor: post-training-tpu-unit + pip_deps: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt" with: - device_type: cpu - device_name: X64 - cloud_runner: linux-x86-n2-16 - image_type: ${{ matrix.image_type }} - pytest_marker: 'cpu_only and not post_training' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" + flavor: ${{ matrix.flavor }} + base_image: maxtext-unit-test-tpu:py312 is_scheduled_run: ${{ github.event_name == 'schedule' }} - worker_group: ${{ matrix.worker_group }} - total_workers: 2 maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} + extra_pip_deps_file: ${{ matrix.pip_deps }} - maxtext_tpu_unit_tests: - needs: build_and_upload_maxtext_package + 
gpu-tests: + needs: [build_and_upload_maxtext_package] if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml strategy: - fail-fast: false - matrix: - image_type: ["py312"] + fail-fast: false + matrix: + flavor: [gpu-unit, gpu-integration] + uses: ./.github/workflows/run_tests_coordinator.yml with: - device_type: tpu - device_name: v6e-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" + flavor: ${{ matrix.flavor }} + base_image: maxtext-unit-test-cuda12:py312 is_scheduled_run: ${{ github.event_name == 'schedule' }} maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - maxtext_tpu_integration_tests: - needs: build_and_upload_maxtext_package + cpu-tests: + needs: [build_and_upload_maxtext_package] if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml + uses: ./.github/workflows/run_tests_coordinator.yml strategy: - fail-fast: false - matrix: - image_type: ["py312"] + fail-fast: false + matrix: + include: + - flavor: cpu-unit + pip_deps: "" + - flavor: post-training-cpu-unit + pip_deps: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt" with: - device_type: tpu - device_name: v6e-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" + flavor: ${{ matrix.flavor }} + base_image: maxtext-unit-test-tpu:py312 is_scheduled_run: ${{ github.event_name == 'schedule' }} maxtext_sha: ${{ 
needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} + extra_pip_deps_file: ${{ matrix.pip_deps }} maxtext_tpu_pathways_unit_tests: needs: build_and_upload_maxtext_package @@ -188,12 +177,10 @@ jobs: uses: ./.github/workflows/run_pathways_tests.yml strategy: fail-fast: false - matrix: - image_type: ["py312"] with: device_type: tpu device_name: v6e-4 - image_type: ${{ matrix.image_type }} + base_image: maxtext-unit-test-tpu:py312 cloud_runner: linux-x86-ct6e-180-4tpu pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training' xla_python_client_mem_fraction: 0.75 @@ -208,12 +195,10 @@ jobs: uses: ./.github/workflows/run_pathways_tests.yml strategy: fail-fast: false - matrix: - image_type: ["py312"] with: device_type: tpu device_name: v6e-4 - image_type: ${{ matrix.image_type }} + base_image: maxtext-unit-test-tpu:py312 cloud_runner: linux-x86-ct6e-180-4tpu pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training' xla_python_client_mem_fraction: 0.75 @@ -222,95 +207,9 @@ jobs: is_scheduled_run: ${{ github.event_name == 'schedule' }} maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - maxtext_gpu_unit_tests: - needs: build_and_upload_maxtext_package - if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml - strategy: - fail-fast: false - matrix: - image_type: ["py312"] - cuda: ["cuda12"] - with: - device_type: ${{ matrix.cuda }} - device_name: a100-40gb-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training' - xla_python_client_mem_fraction: 0.65 - tf_force_gpu_allow_growth: true - container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} - maxtext_sha: ${{ 
needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - - maxtext_post_training_cpu_unit_tests: - needs: build_and_upload_maxtext_package - if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml - strategy: - fail-fast: false - matrix: - image_type: ["py312"] - with: - device_type: cpu - device_name: X64 - cloud_runner: linux-x86-n2-16 - image_type: ${{ matrix.image_type }} - pytest_marker: 'cpu_only' - pytest_addopts: 'tests/post_training/unit' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} - extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt' - maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - - maxtext_post_training_tpu_unit_tests: - needs: build_and_upload_maxtext_package - if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml - strategy: - fail-fast: false - matrix: - image_type: ["py312"] - with: - device_type: tpu - device_name: v6e-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'tpu_only' - pytest_addopts: 'tests/post_training/unit' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} - extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt' - maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - - maxtext_gpu_integration_tests: - needs: build_and_upload_maxtext_package - if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml - strategy: - fail-fast: false - matrix: - image_type: ["py312"] - cuda: ["cuda12"] - with: - device_type: ${{ 
matrix.cuda }} - device_name: a100-40gb-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training' - xla_python_client_mem_fraction: 0.65 - tf_force_gpu_allow_growth: true - container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} - maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - all_tests_passed: name: All Required Tests Passed - needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests] + needs: [doc_only_check, build_and_upload_maxtext_package, tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests] if: always() runs-on: ubuntu-latest steps: @@ -324,15 +223,11 @@ jobs: # Otherwise, check that build and all tests passed or were skipped echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}" - echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}" - echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}" - echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}" - echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}" - echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}" - echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}" - echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}" - echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}" - echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}" + echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}" + echo "GPU Tests (Matrix) result: 
${NEEDS_GPU_TESTS_RESULT}" + echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}" + echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}" + echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}" # Fail only if any job failed or was cancelled (skipped is OK) if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then @@ -344,15 +239,11 @@ jobs: env: NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS: ${{ needs.doc_only_check.outputs.run_tests }} NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT: ${{ needs.build_and_upload_maxtext_package.result }} - NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_cpu_unit_tests.result }} - NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_unit_tests.result }} - NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_integration_tests.result }} + NEEDS_CPU_TESTS_RESULT: ${{ needs.cpu-tests.result }} + NEEDS_TPU_TESTS_RESULT: ${{ needs.tpu-tests.result }} + NEEDS_GPU_TESTS_RESULT: ${{ needs.gpu-tests.result }} NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_unit_tests.result }} NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }} - NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }} - NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }} - NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }} - NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }} all_notebooks_passed: name: All Notebooks Passed @@ -385,14 +276,14 @@ jobs: notify_failure: name: Notify failed build # creates an issue or modifies last open existing issue for failed build - needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, 
maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests] + needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests] if: ${{ always() }} runs-on: ubuntu-latest permissions: issues: write steps: - - name: Check whether one of the jobs failed - if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }} - uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Check whether one of the jobs failed + if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }} + uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/run_jupyter_notebooks.yml b/.github/workflows/run_jupyter_notebooks.yml index 0c7d83e458..0bb6936f3a 100644 --- a/.github/workflows/run_jupyter_notebooks.yml +++ b/.github/workflows/run_jupyter_notebooks.yml @@ -25,15 +25,20 @@ on: device_name: required: true type: string - image_type: - required: false + base_image: + required: true type: string cloud_runner: required: false type: string maxtext_sha: - required: true + required: false type: string + # Flag to skip source checkout and wheel installation + maxtext_installed: + required: false + type: boolean + default: false secrets: HF_TOKEN: required: true @@ -44,17 +49,20 @@ jobs: run: runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }} container: - image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type 
== 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }} + image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }} steps: - name: Checkout MaxText + if: ${{ !inputs.maxtext_installed }} uses: actions/checkout@v5 with: ref: ${{ inputs.maxtext_sha }} - name: Download the MaxText wheel + if: ${{ !inputs.maxtext_installed }} uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 with: name: maxtext-wheel - name: Install MaxText and Dependencies + if: ${{ !inputs.maxtext_installed }} shell: bash run: | # 1. Create virtual environment @@ -65,7 +73,6 @@ jobs: # 2. Install MaxText package and all the post training dependencies uv pip install ${maxtext_wheel}[tpu-post-train] --resolution=lowest install_maxtext_tpu_post_train_extra_deps - .venv/bin/python3 -m ipykernel install --user --name maxtext_venv python3 -m pip freeze - name: Run Post-Training Notebooks @@ -73,13 +80,27 @@ jobs: env: PYTHONPATH: "${{ github.workspace }}/src" HF_TOKEN: ${{ secrets.HF_TOKEN }} + MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }} run: | - source .venv/bin/activate + if [ "${MAXTEXT_INSTALLED}" == "true" ]; then + # Move to the directory where code is baked into the image. See the Dockerfile. + # This is necessary because GHA sets an empty workspace by default. + cd /deps + PYTHON_EXE="python3" + PAPERMILL_EXE="papermill" + else + PYTHON_EXE=".venv/bin/python3" + PAPERMILL_EXE=".venv/bin/papermill" + source .venv/bin/activate + fi export MAXTEXT_REPO_ROOT=$(pwd) export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext export MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples" + # Install dependencies for running notebooks + $PYTHON_EXE -m ipykernel install --user --name maxtext_venv + for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do filename=$(basename "$notebook") if [[ "$filename" == "sft_qwen3_demo.ipynb" || "$filename" == "sft_llama3_demo_gpu.ipynb" ]]; then @@ -92,7 +113,7 @@ jobs: echo "Running $filename ..." 
echo "------------------------------------------------------" - papermill "$notebook" "$output_name" -k maxtext_venv + $PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv done - name: Upload Outputs if: always() diff --git a/.github/workflows/run_pathways_tests.yml b/.github/workflows/run_pathways_tests.yml index ecf4182f22..fd2bdd7a8e 100644 --- a/.github/workflows/run_pathways_tests.yml +++ b/.github/workflows/run_pathways_tests.yml @@ -25,8 +25,8 @@ on: device_name: required: true type: string - image_type: - required: false + base_image: + required: true type: string pytest_marker: required: true @@ -61,7 +61,7 @@ jobs: run: runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }} container: - image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-tpu:${{ inputs.image_type != '' && inputs.image_type }} + image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }} env: XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }} TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }} diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml index 0b85ccc94f..300d3fa30e 100644 --- a/.github/workflows/run_tests_against_package.yml +++ b/.github/workflows/run_tests_against_package.yml @@ -25,12 +25,15 @@ on: device_name: required: true type: string - image_type: - required: false + base_image: + required: true type: string pytest_marker: required: true type: string + pytest_extra_args: + required: false + type: string pytest_addopts: required: false type: string @@ -59,12 +62,18 @@ on: type: number default: 1 maxtext_sha: - required: true + description: 'Git SHA to checkout if MaxText is not pre-installed' + required: false type: string extra_pip_deps_file: required: false type: string default: '' + # Flag to skip source checkout and wheel installation + maxtext_installed: + 
description: 'If false, maxtext_sha must be provided for checkout' + type: boolean + default: false permissions: contents: read @@ -72,7 +81,7 @@ jobs: run: runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }} container: - image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }} + image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }} env: XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }} TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }} @@ -82,14 +91,17 @@ jobs: options: ${{ inputs.container_resource_option }} steps: - name: Checkout MaxText + if: ${{ !inputs.maxtext_installed }} uses: actions/checkout@v5 with: ref: ${{ inputs.maxtext_sha }} - name: Download the maxtext wheel + if: ${{ !inputs.maxtext_installed }} uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: maxtext-wheel - name: Install the maxtext wheel + if: ${{ !inputs.maxtext_installed }} shell: bash run: | python3 -m uv venv --seed @@ -101,16 +113,32 @@ jobs: python3 -m pip freeze uv pip install pytest-cov - name: Install extra pip deps - if: inputs.extra_pip_deps_file != '' + if: inputs.extra_pip_deps_file != '' && !inputs.maxtext_installed shell: bash run: | source .venv/bin/activate uv pip install -r ${{ inputs.extra_pip_deps_file }} - name: Copy test assets files + if: ${{ !inputs.maxtext_installed }} run : gcloud storage cp gs://maxtext-test-assets/* tests/assets - name: Run Tests shell: bash run: | + # Determine environment and entry directory + if [ "${INPUTS_MAXTEXT_INSTALLED}" == "true" ]; then + # Move to the directory where code is baked into the image. See the Dockerfile. 
+ cd /deps + PYTHON_EXE="python3" + # Disable coverage flags when testing against a pre-installed package + PYTEST_COV_ARGS="" + else + # Use the local virtual environment created in Step 3 + PYTHON_EXE=".venv/bin/python3" + # Ensure pytest-cov is available and enable coverage flags + $PYTHON_EXE -m pip install --quiet pytest-cov + PYTEST_COV_ARGS="--cov=src/MaxText --cov=maxtext --cov-report=xml --cov-report=term" + fi + if [ "${INPUTS_IS_SCHEDULED_RUN}" == "true" ]; then FINAL_PYTEST_MARKER="${INPUTS_PYTEST_MARKER}" else @@ -125,28 +153,32 @@ if [ "${INPUTS_DEVICE_TYPE}" != "cuda12" ]; then export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536' fi - if [ "${{ inputs.total_workers }}" -gt 1 ]; then - .venv/bin/python3 -m pip install --quiet pytest-split pytest-xdist - SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }} -n auto" + if [ "${INPUTS_TOTAL_WORKERS}" -gt 1 ]; then + $PYTHON_EXE -m pip install --quiet pytest-split pytest-xdist + SPLIT_ARGS="--splits ${INPUTS_TOTAL_WORKERS} --group ${INPUTS_WORKER_GROUP} -n auto" else SPLIT_ARGS="" fi - .venv/bin/python3 -m pytest ${INPUTS_PYTEST_ADDOPTS} \ + $PYTHON_EXE -m pytest ${INPUTS_PYTEST_ADDOPTS} \ -v \ -m "${FINAL_PYTEST_MARKER}" \ --durations=0 \ - --cov=MaxText \ - --cov=maxtext \ - --cov-report=xml \ - --cov-report=term \ - $SPLIT_ARGS + $PYTEST_COV_ARGS \ + $SPLIT_ARGS \ + ${INPUTS_PYTEST_EXTRA_ARGS} + env: PYTHONPATH: "${{ github.workspace }}/src" INPUTS_IS_SCHEDULED_RUN: ${{ inputs.is_scheduled_run }} INPUTS_PYTEST_MARKER: ${{ inputs.pytest_marker }} INPUTS_DEVICE_TYPE: ${{ inputs.device_type }} INPUTS_PYTEST_ADDOPTS: ${{ inputs.pytest_addopts }} + INPUTS_TOTAL_WORKERS: ${{ inputs.total_workers }} + INPUTS_WORKER_GROUP: ${{ inputs.worker_group }} + INPUTS_PYTEST_EXTRA_ARGS: ${{ inputs.pytest_extra_args }} + INPUTS_MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }} - name: Upload results to Codecov + if: ${{ !inputs.maxtext_installed }} # Skip code coverage 
upload for maxtext image testing uses: codecov/codecov-action@v5 continue-on-error: true with: diff --git a/.github/workflows/run_tests_coordinator.yml b/.github/workflows/run_tests_coordinator.yml new file mode 100644 index 0000000000..0182ec5965 --- /dev/null +++ b/.github/workflows/run_tests_coordinator.yml @@ -0,0 +1,145 @@ +# Copyright 2023-2026 Google LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file defines a module for running tests against the built maxtext package or +# pre-built images. It can run unit tests and integration tests. 
+ +name: MaxText Test Coordinator + +on: + workflow_call: + inputs: + flavor: + description: > + Test flavor ( + tpu-unit, tpu-integration, + post-training-tpu-unit, post-training-tpu-integration, + gpu-unit, gpu-integration, + cpu-unit, + post-training-cpu-unit + ) + required: true + type: string + base_image: + description: 'The docker image to run tests against' + required: true + type: string + extra_pip_deps_file: + required: false + type: string + default: '' + is_scheduled_run: + required: false + type: boolean + default: false + maxtext_sha: + description: 'Git SHA to checkout if MaxText is not pre-installed' + required: false + type: string + maxtext_installed: + description: 'If false, maxtext_sha must be provided for checkout' + type: boolean + default: false + +permissions: + contents: read + +jobs: + execute-test-package: + name: ${{ inputs.flavor }} + strategy: + fail-fast: false + matrix: + worker_group: ${{ fromJSON(contains(inputs.flavor, 'cpu') && '[1, 2]' || '[1]') }} + + uses: ./.github/workflows/run_tests_against_package.yml + with: + # Infrastructure Mapping + device_type: >- + ${{ fromJSON('{ + "tpu-unit": "tpu", + "tpu-integration": "tpu", + "post-training-tpu-unit": "tpu", + "post-training-tpu-integration": "tpu", + "gpu-unit": "cuda12", + "gpu-integration": "cuda12", + "cpu-unit": "cpu", + "post-training-cpu-unit": "cpu" + }')[inputs.flavor] }} + + device_name: >- + ${{ fromJSON('{ + "tpu-unit": "v6e-4", + "tpu-integration": "v6e-4", + "post-training-tpu-unit": "v6e-4", + "post-training-tpu-integration": "v6e-4", + "gpu-unit": "a100-40gb-4", + "gpu-integration": "a100-40gb-4", + "cpu-unit": "X64", + "post-training-cpu-unit": "X64" + }')[inputs.flavor] }} + + cloud_runner: >- + ${{ fromJSON('{ + "tpu-unit": "linux-x86-ct6e-180-4tpu", + "tpu-integration": "linux-x86-ct6e-180-4tpu", + "post-training-tpu-unit": "linux-x86-ct6e-180-4tpu", + "post-training-tpu-integration": "linux-x86-ct6e-180-4tpu", + "gpu-unit": "linux-x86-a2-48-a100-4gpu", + 
"gpu-integration": "linux-x86-a2-48-a100-4gpu", + "cpu-unit": "linux-x86-n2-16", + "post-training-cpu-unit": "linux-x86-n2-16" + }')[inputs.flavor] }} + # Pytest Marker Mapping + pytest_marker: >- + ${{ fromJSON('{ + "tpu-unit": "not cpu_only and not gpu_only and not integration_test and not post_training", + "tpu-integration": "not cpu_only and not gpu_only and integration_test and not post_training", + "post-training-tpu-unit": "not cpu_only and not gpu_only and not integration_test", + "post-training-tpu-integration": "not cpu_only and not gpu_only and integration_test", + "gpu-unit": "not cpu_only and not tpu_only and not integration_test and not post_training", + "gpu-integration": "not cpu_only and not tpu_only and integration_test and not post_training", + "cpu-unit": "cpu_only and not post_training", + "post-training-cpu-unit": "cpu_only" + }')[inputs.flavor] }} + + pytest_extra_args: >- + ${{ fromJSON('{ + "tpu-unit": "--ignore=tests/post_training", + "tpu-integration": "--ignore=tests/post_training", + "post-training-tpu-unit": "", + "post-training-tpu-integration": "", + "gpu-unit": "--ignore=tests/post_training", + "gpu-integration": "--ignore=tests/post_training", + "cpu-unit": "--ignore=tests/post_training", + "post-training-cpu-unit": "" + }')[inputs.flavor] }} + + # Resource Scaling + xla_python_client_mem_fraction: "${{ contains(inputs.flavor, 'gpu') && '0.65' || '0.75' }}" + tf_force_gpu_allow_growth: "${{ contains(inputs.flavor, 'gpu') && 'true' || 'false' }}" + + container_resource_option: >- + ${{ contains(inputs.flavor, 'gpu') + && '--shm-size 2g --runtime=nvidia --gpus all --privileged' + || '--privileged' }} + + # Metadata + base_image: ${{ inputs.base_image }} + is_scheduled_run: ${{ inputs.is_scheduled_run }} + maxtext_installed: ${{ inputs.maxtext_installed }} + worker_group: ${{ matrix.worker_group }} + total_workers: ${{ contains(inputs.flavor, 'cpu') && 2 || 1 }} + maxtext_sha: ${{ inputs.maxtext_sha }} + extra_pip_deps_file: ${{ 
inputs.extra_pip_deps_file }} \ No newline at end of file