Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions .github/workflows/UploadDockerImages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# This workflow builds and pushes MaxText images for both TPU and GPU devices.
# It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.

name: Build Images
name: Build and Test Images
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't rename this workflow


on:
schedule:
Expand All @@ -32,6 +32,11 @@ on:
- all
- tpu
- gpu
# Dispatch toggle: when it is "true" the setup job emits "-test" as the image
# suffix; otherwise the suffix is left empty.
for_dev_test:
  type: boolean
  required: false
  default: false
  description: "For development test purpose. All images will be added a -test suffix"

permissions:
contents: read
Expand All @@ -42,6 +47,7 @@ jobs:
outputs:
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
image_date: ${{ steps.vars.outputs.image_date }}
image_suffix: ${{ steps.vars.outputs.image_suffix }}
steps:
- name: Checkout MaxText
uses: actions/checkout@v5
Expand All @@ -55,6 +61,13 @@ jobs:
# Image date
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT

# If for_dev_test is true, set suffix to -test, otherwise empty
if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can avoid this if/else by replacing `for_dev_test` with a string input, say `image_suffix`. Users can then specify a suffix of their own, and it stays empty when they don't. With that, line 88 and the similar lines simply become `image_name: ${{ matrix.image_name }}${{ github.event.inputs.image_suffix }}`.

echo "image_suffix=-test" >> $GITHUB_OUTPUT
else
echo "image_suffix=" >> $GITHUB_OUTPUT
fi

tpu-pre-training:
name: ${{ matrix.image_name }}
needs: setup
Expand All @@ -72,25 +85,27 @@ jobs:
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}
test_mode: tpu-pre-training

# Builds the nightly post-training TPU image by calling the reusable
# build_and_push_docker_image workflow once the setup job has finished.
tpu-post-training-nightly:
  name: tpu-post-training-nightly
  needs: [setup]
  uses: ./.github/workflows/build_and_push_docker_image.yml
  with:
    # Exactly one image_name entry (a mapping must not repeat a key). The
    # suffix from the setup job is appended so that for_dev_test runs publish
    # under a separate "-test" image name.
    image_name: maxtext_post_training_nightly${{ needs.setup.outputs.image_suffix }}
    device: tpu
    build_mode: nightly
    workflow: post-training
    dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
    maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
    image_date: ${{ needs.setup.outputs.image_date }}
    # Key into the flavor map of the called workflow's test job.
    test_mode: tpu-post-training

gpu-pre-training:
name: ${{ matrix.image_name }}
Expand All @@ -109,9 +124,10 @@ jobs:
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}
test_mode: gpu-pre-training
83 changes: 71 additions & 12 deletions .github/workflows/build_and_push_docker_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ on:
required: false
type: string
default: ''
# Required selector naming which group of test flavors applies to the image.
test_mode:
  type: string
  required: true
  description: 'Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)'

permissions:
contents: read
Expand All @@ -61,6 +65,8 @@ jobs:
github.event.inputs.target_device == 'tpu' ||
github.event.inputs.target_device == 'gpu'
)
outputs:
should_run: ${{ steps.check.outputs.should_run }} # Map the step output to the job level
steps:
- name: Check if build should run
id: check
Expand All @@ -87,7 +93,9 @@ jobs:
ref: ${{ inputs.maxtext_sha }}

- name: Checkout post-training dependencies
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rebase; this is now removed.

if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
if: |
steps.check.outputs.should_run == 'true' &&
contains(inputs.image_name, 'post_training_nightly')
run: |
git clone https://github.com/google/tunix.git ./tunix
git clone https://github.com/vllm-project/vllm.git ./vllm
Expand Down Expand Up @@ -127,38 +135,89 @@ jobs:
LIBTPU_VERSION=NONE
INCLUDE_TEST_ASSETS=true

# Test stage: fans out over the flavors mapped to inputs.test_mode and hands
# each one to the tests coordinator, pointing it at the image tagged with this
# run's id.
test:
  needs: [build_and_push]
  # Run only when the build job succeeded and reported should_run == 'true'.
  if: >-
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  uses: ./.github/workflows/run_tests_coordinator.yml
  with:
    flavor: ${{ matrix.flavor }}
    base_image: ${{ inputs.image_name }}:${{ github.run_id }}
    is_scheduled_run: true
    maxtext_installed: true
  strategy:
    # A failing flavor does not cancel the remaining matrix entries.
    fail-fast: false
    matrix:
      # fromJSON parses the literal map; indexing it with inputs.test_mode
      # yields the list of flavors for this image.
      flavor: >-
        ${{ fromJSON('{
        "gpu-pre-training": ["gpu-unit", "gpu-integration"],
        "tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration", "post-training-cpu-unit"],
        "tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"]
        }')[inputs.test_mode] }}

# Notebook stage: calls the Jupyter notebook workflow against the image tagged
# with this run's id. Restricted to the tpu-post-training test mode.
notebook-test:
  needs: [build_and_push]
  # Same build gate as the test job, plus the test-mode restriction.
  if: >-
    inputs.test_mode == 'tpu-post-training' &&
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  uses: ./.github/workflows/run_jupyter_notebooks.yml
  secrets:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  with:
    base_image: ${{ inputs.image_name }}:${{ github.run_id }}
    maxtext_installed: true
    device_type: tpu
    device_name: v6e-4
    cloud_runner: linux-x86-ct6e-180-4tpu

# Tagging stage: promotes the image tagged with this run's id to its release or
# nightly tags. It runs only after the test job succeeded and the notebook job
# either succeeded or was skipped.
tagging:
  needs: [test, notebook-test]
  # always() keeps the job eligible when notebook-test is skipped; the explicit
  # result checks then decide whether it actually runs.
  if: |
    always() &&
    needs.test.result == 'success' &&
    (needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
  runs-on: linux-x86-n2-16-buildkit
  container: google/cloud-sdk:524.0.0
  steps:
    # The tag script calls `git rev-parse --short HEAD`, which needs a work
    # tree in this job; check out the commit the image was built from.
    - name: Checkout MaxText
      uses: actions/checkout@v5
      with:
        ref: ${{ inputs.maxtext_sha }}

    - name: Configure Docker
      run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q

    # No step-level `if` on steps.check here: the `check` step belongs to the
    # build_and_push job, so inside this job the expression would be empty and
    # the step would always be skipped. The job-level `if` already depends on
    # the test job, which itself requires should_run == 'true'.
    - name: Add tags to Docker image
      shell: bash
      run: |
        SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
        # Temporary tag under which this run's image is addressed.
        TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"

        if [[ $INPUTS_VERSION_NAME ]]; then
          echo "Tagging docker images corresponding to PyPI release..."
          gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_VERSION_NAME}" --quiet
        else
          echo "Tagging docker images corresponding to nightly release..."

          # Add date tag
          gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet

          # Convert date to YYYYMMDD format
          clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)

          # Add MaxText tag
          maxtext_hash=$(git rev-parse --short HEAD)
          gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${maxtext_hash}_${clean_date}" --quiet

          # Add post-training dependencies tags
          # NOTE(review): tunix/vllm/tpu-inference are not cloned in this job,
          # so the -d guard skips these tags here - confirm whether they are
          # still wanted after the split into a separate tagging job.
          if [ "${{ inputs.workflow }}" == "post-training" ]; then
            for dir in tunix vllm tpu-inference; do
              if [ -d "./$dir" ]; then
                dir_hash=$(git -C "$dir" rev-parse --short HEAD)
                gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${dir}_${dir_hash}_${clean_date}" --quiet
              fi
            done
          fi
        fi
        # Latest Tag
        gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet
      env:
        INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
        INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/pypi_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,4 @@ jobs:
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ github.sha }}
version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
test_mode: ${{ matrix.device}}-${{ matrix.workflow }}
13 changes: 9 additions & 4 deletions .github/workflows/run_tests_against_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,21 @@ jobs:
if [ "${INPUTS_MAXTEXT_INSTALLED}" == "true" ]; then
# Move to the directory where code is baked into the image. See the Dockerfile.
cd /deps
REPO_ROOT="/deps"

PYTHON_EXE="python3"
# Disable coverage flags when testing against a pre-installed package
PYTEST_COV_ARGS=""
else
REPO_ROOT="${{ github.workspace }}"

# Use the local virtual environment created in Step 3
PYTHON_EXE=".venv/bin/python3"
# Ensure pytest-cov is available and enable coverage flags
$PYTHON_EXE -m pip install --quiet pytest-cov
PYTEST_COV_ARGS="--cov=src/MaxText --cov=maxtext --cov-report=xml --cov-report=term"
PYTEST_COV_ARGS="--cov=src/MaxText --cov=src/maxtext --cov-report=xml --cov-report=term"
fi
export PYTHONPATH="${REPO_ROOT}/src${PYTHONPATH:+:${PYTHONPATH}}"

if [ "${INPUTS_IS_SCHEDULED_RUN}" == "true" ]; then
FINAL_PYTEST_MARKER="${INPUTS_PYTEST_MARKER}"
Expand Down Expand Up @@ -165,16 +170,16 @@ jobs:
--durations=0 \
$PYTEST_COV_ARGS \
$SPLIT_ARGS \
${INPUTS_PYTEST_EXTRA_ARGS}
${INPUTS_PYTEST_EXTRA_ARGS} \
./tests ./src

env:
PYTHONPATH: "${{ github.workspace }}/src"
INPUTS_IS_SCHEDULED_RUN: ${{ inputs.is_scheduled_run }}
INPUTS_PYTEST_MARKER: ${{ inputs.pytest_marker }}
INPUTS_DEVICE_TYPE: ${{ inputs.device_type }}
INPUTS_PYTEST_ADDOPTS: ${{ inputs.pytest_addopts }}
INPUTS_TOTAL_WORKERS: ${{ inputs.total_workers }}
INPUTS_WORKER_GROUP: ${{ inputs.total_workers }}
INPUTS_WORKER_GROUP: ${{ inputs.worker_group }}
INPUTS_PYTEST_EXTRA_ARGS: ${{ inputs.pytest_extra_args }}
INPUTS_MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
- name: Upload results to Codecov
Expand Down
Loading