diff --git a/.github/workflows/UploadDockerImages.yml b/.github/workflows/UploadDockerImages.yml
index eb0c472f2c..1df4fbc659 100644
--- a/.github/workflows/UploadDockerImages.yml
+++ b/.github/workflows/UploadDockerImages.yml
@@ -15,7 +15,7 @@
 # This workflow builds and pushes MaxText images for both TPU and GPU devices.
 # It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.
 
-name: Build Images
+name: Build and Test Images
 
 on:
   schedule:
@@ -32,6 +32,11 @@ on:
           - all
           - tpu
           - gpu
+      for_dev_test:
+        description: 'For development test purpose. All images will be added a -test suffix'
+        required: false
+        type: boolean
+        default: false
 
 permissions:
   contents: read
@@ -42,6 +47,7 @@ jobs:
     outputs:
       maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
       image_date: ${{ steps.vars.outputs.image_date }}
+      image_suffix: ${{ steps.vars.outputs.image_suffix }}
     steps:
       - name: Checkout MaxText
         uses: actions/checkout@v5
@@ -55,6 +61,13 @@ jobs:
           # Image date
           echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
 
+          # If for_dev_test is true, set suffix to -test, otherwise empty
+          if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
+            echo "image_suffix=-test" >> $GITHUB_OUTPUT
+          else
+            echo "image_suffix=" >> $GITHUB_OUTPUT
+          fi
+
   tpu-pre-training:
     name: ${{ matrix.image_name }}
     needs: setup
@@ -72,25 +85,27 @@ jobs:
             dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
     uses: ./.github/workflows/build_and_push_docker_image.yml
     with:
-      image_name: ${{ matrix.image_name }}
+      image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
       device: ${{ matrix.device }}
       build_mode: ${{ matrix.build_mode }}
       dockerfile: ${{ matrix.dockerfile }}
       maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
       image_date: ${{ needs.setup.outputs.image_date }}
+      test_mode: tpu-pre-training
 
   tpu-post-training-nightly:
     name: tpu-post-training-nightly
     needs: [setup]
     uses: ./.github/workflows/build_and_push_docker_image.yml
     with:
-      image_name: maxtext_post_training_nightly
+      image_name: maxtext_post_training_nightly${{ needs.setup.outputs.image_suffix }}
       device: tpu
       build_mode: nightly
       workflow: post-training
       dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
       maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
       image_date: ${{ needs.setup.outputs.image_date }}
+      test_mode: tpu-post-training
 
   gpu-pre-training:
     name: ${{ matrix.image_name }}
@@ -109,9 +124,10 @@ jobs:
             dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
     uses: ./.github/workflows/build_and_push_docker_image.yml
     with:
-      image_name: ${{ matrix.image_name }}
+      image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
       device: ${{ matrix.device }}
       build_mode: ${{ matrix.build_mode }}
       dockerfile: ${{ matrix.dockerfile }}
       maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
       image_date: ${{ needs.setup.outputs.image_date }}
+      test_mode: gpu-pre-training
diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
index 5ff6591d82..04afd0a616 100644
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -45,6 +45,10 @@ on:
         required: false
         type: string
         default: ''
+      test_mode:
+        description: "Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)"
+        required: true
+        type: string
 
 permissions:
   contents: read
@@ -61,6 +65,8 @@ jobs:
         github.event.inputs.target_device == 'tpu' ||
         github.event.inputs.target_device == 'gpu'
       )
+    outputs:
+      should_run: ${{ steps.check.outputs.should_run }}  # Map the step output to the job level
     steps:
       - name: Check if build should run
         id: check
@@ -87,7 +93,9 @@ jobs:
           ref: ${{ inputs.maxtext_sha }}
 
       - name: Checkout post-training dependencies
-        if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
+        if: |
+          steps.check.outputs.should_run == 'true' &&
+          contains(inputs.image_name, 'post_training_nightly')
         run: |
           git clone https://github.com/google/tunix.git ./tunix
           git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -127,38 +135,99 @@ jobs:
             LIBTPU_VERSION=NONE
             INCLUDE_TEST_ASSETS=true
 
+  test:
+    needs: build_and_push
+    if: |
+      needs.build_and_push.result == 'success' &&
+      needs.build_and_push.outputs.should_run == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        flavor: >-
+          ${{ fromJSON('{
+            "gpu-pre-training": ["gpu-unit", "gpu-integration"],
+            "tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration", "post-training-cpu-unit"],
+            "tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"]
+          }')[inputs.test_mode] }}
+    uses: ./.github/workflows/run_tests_coordinator.yml
+    with:
+      flavor: ${{ matrix.flavor }}
+      base_image: ${{ inputs.image_name }}:${{ github.run_id }}
+      is_scheduled_run: true
+      maxtext_installed: true
+
+  notebook-test:
+    needs: build_and_push
+    if: |
+      inputs.test_mode == 'tpu-post-training' &&
+      needs.build_and_push.result == 'success' &&
+      needs.build_and_push.outputs.should_run == 'true'
+    uses: ./.github/workflows/run_jupyter_notebooks.yml
+    with:
+      device_type: tpu
+      device_name: v6e-4
+      base_image: ${{ inputs.image_name }}:${{ github.run_id }}
+      cloud_runner: linux-x86-ct6e-180-4tpu
+      maxtext_installed: true
+    secrets:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+  tagging:
+    needs: [test, notebook-test]
+    if: |
+      always() &&
+      needs.test.result == 'success' &&
+      (needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
+    runs-on: linux-x86-n2-16-buildkit
+    container: google/cloud-sdk:524.0.0
+    steps:
+      - name: Configure Docker
+        run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
+
+      # This job has no other checkout step, so check out the built commit here;
+      # the tagging script below reads the MaxText hash with `git rev-parse`.
+      - name: Checkout MaxText
+        uses: actions/checkout@v5
+        with:
+          ref: ${{ inputs.maxtext_sha }}
+
       - name: Add tags to Docker image
-        if: steps.check.outputs.should_run == 'true'
         shell: bash
         run: |
           SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
+          TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
 
           if [[ $INPUTS_VERSION_NAME ]]; then
             echo "Tagging docker images corresponding to PyPI release..."
-            gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_VERSION_NAME}" --quiet
+            gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_VERSION_NAME}" --quiet
           else
             echo "Tagging docker images corresponding to nightly release..."
 
             # Add date tag
-            gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
+            gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
 
             # Convert date to YYYYMMDD format
             clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
 
             # Add MaxText tag
             maxtext_hash=$(git rev-parse --short HEAD)
-            gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
-
-            # Add post-training dependencies tags
-            if [ "${{ inputs.workflow }}" == "post-training" ]; then
-              for dir in tunix vllm tpu-inference; do
-                if [ -d "./$dir" ]; then
-                  dir_hash=$(git -C "$dir" rev-parse --short HEAD)
-                  gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
+            gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${maxtext_hash}_${clean_date}" --quiet
+
+            # Add post-training dependencies tags
+            # NOTE(review): the tunix/vllm clones run in the build_and_push job, not in
+            # this job, so the -d check below presumably skips every directory here.
+            # Confirm whether the hashes should be passed in as build_and_push outputs.
+            if [ "${{ inputs.workflow }}" == "post-training" ]; then
+              for dir in tunix vllm tpu-inference; do
+                if [ -d "./$dir" ]; then
+                  dir_hash=$(git -C "$dir" rev-parse --short HEAD)
+                  gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${dir}_${dir_hash}_${clean_date}" --quiet
                 fi
               done
             fi
           fi
+          # Latest Tag
+          gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet
         env:
           INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
           INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
diff --git a/.github/workflows/pypi_release.yml b/.github/workflows/pypi_release.yml
index 526f2efae2..4f3e39a69c 100644
--- a/.github/workflows/pypi_release.yml
+++ b/.github/workflows/pypi_release.yml
@@ -123,3 +123,4 @@ jobs:
       dockerfile: ${{ matrix.dockerfile }}
       maxtext_sha: ${{ github.sha }}
       version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
+      test_mode: ${{ matrix.device }}-${{ matrix.workflow }}
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
index 300d3fa30e..e3cd738266 100644
--- a/.github/workflows/run_tests_against_package.yml
+++ b/.github/workflows/run_tests_against_package.yml
@@ -128,16 +128,21 @@ jobs:
           if [ "${INPUTS_MAXTEXT_INSTALLED}" == "true" ]; then
             # Move to the directory where code is baked into the image. See the Dockerfile.
             cd /deps
+            REPO_ROOT="/deps"
+
             PYTHON_EXE="python3"
             # Disable coverage flags when testing against a pre-installed package
             PYTEST_COV_ARGS=""
           else
+            REPO_ROOT="${{ github.workspace }}"
+
             # Use the local virtual environment created in Step 3
             PYTHON_EXE=".venv/bin/python3"
             # Ensure pytest-cov is available and enable coverage flags
             $PYTHON_EXE -m pip install --quiet pytest-cov
-            PYTEST_COV_ARGS="--cov=src/MaxText --cov=maxtext --cov-report=xml --cov-report=term"
+            PYTEST_COV_ARGS="--cov=src/MaxText --cov=src/maxtext --cov-report=xml --cov-report=term"
           fi
+          export PYTHONPATH="${REPO_ROOT}/src${PYTHONPATH:+:${PYTHONPATH}}"
 
           if [ "${INPUTS_IS_SCHEDULED_RUN}" == "true" ]; then
             FINAL_PYTEST_MARKER="${INPUTS_PYTEST_MARKER}"
@@ -165,16 +170,16 @@ jobs:
             --durations=0 \
             $PYTEST_COV_ARGS \
             $SPLIT_ARGS \
-            ${INPUTS_PYTEST_EXTRA_ARGS}
+            ${INPUTS_PYTEST_EXTRA_ARGS} \
+            ./tests ./src
 
         env:
-          PYTHONPATH: "${{ github.workspace }}/src"
           INPUTS_IS_SCHEDULED_RUN: ${{ inputs.is_scheduled_run }}
           INPUTS_PYTEST_MARKER: ${{ inputs.pytest_marker }}
           INPUTS_DEVICE_TYPE: ${{ inputs.device_type }}
           INPUTS_PYTEST_ADDOPTS: ${{ inputs.pytest_addopts }}
           INPUTS_TOTAL_WORKERS: ${{ inputs.total_workers }}
-          INPUTS_WORKER_GROUP: ${{ inputs.total_workers }}
+          INPUTS_WORKER_GROUP: ${{ inputs.worker_group }}
           INPUTS_PYTEST_EXTRA_ARGS: ${{ inputs.pytest_extra_args }}
           INPUTS_MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
       - name: Upload results to Codecov