Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 54 additions & 163 deletions .github/workflows/build_and_test_maxtext.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,92 +108,79 @@ jobs:
uses: ./.github/workflows/run_jupyter_notebooks.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
with:
device_type: tpu
device_name: v6e-4
image_type: ${{ matrix.image_type }}
base_image: maxtext-unit-test-tpu:py312
cloud_runner: linux-x86-ct6e-180-4tpu
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

maxtext_cpu_unit_tests:
needs: build_and_upload_maxtext_package
tpu-tests:
needs: [build_and_upload_maxtext_package]
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_tests_against_package.yml
uses: ./.github/workflows/run_tests_coordinator.yml
strategy:
fail-fast: false # don't cancel all jobs on failure
matrix:
image_type: ["py312"]
worker_group: [1, 2]
fail-fast: false
matrix:
include:
- flavor: tpu-unit
pip_deps: ""
- flavor: tpu-integration
pip_deps: ""
- flavor: post-training-tpu-unit
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For consistency, let's rename this to tpu-post-training-unit

pip_deps: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
with:
device_type: cpu
device_name: X64
cloud_runner: linux-x86-n2-16
image_type: ${{ matrix.image_type }}
pytest_marker: 'cpu_only and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
flavor: ${{ matrix.flavor }}
base_image: maxtext-unit-test-tpu:py312
is_scheduled_run: ${{ github.event_name == 'schedule' }}
worker_group: ${{ matrix.worker_group }}
total_workers: 2
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
extra_pip_deps_file: ${{ matrix.pip_deps }}

maxtext_tpu_unit_tests:
needs: build_and_upload_maxtext_package
gpu-tests:
needs: [build_and_upload_maxtext_package]
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_tests_against_package.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
fail-fast: false
matrix:
flavor: [gpu-unit, gpu-integration]
uses: ./.github/workflows/run_tests_coordinator.yml
with:
device_type: tpu
device_name: v6e-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
flavor: ${{ matrix.flavor }}
base_image: maxtext-unit-test-cuda12:py312
is_scheduled_run: ${{ github.event_name == 'schedule' }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

maxtext_tpu_integration_tests:
needs: build_and_upload_maxtext_package
cpu-tests:
needs: [build_and_upload_maxtext_package]
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_tests_against_package.yml
uses: ./.github/workflows/run_tests_coordinator.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
fail-fast: false
matrix:
include:
- flavor: cpu-unit
pip_deps: ""
- flavor: post-training-cpu-unit
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: for consistency, let's rename this flavor to cpu-post-training-unit.

pip_deps: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
with:
device_type: tpu
device_name: v6e-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
flavor: ${{ matrix.flavor }}
base_image: maxtext-unit-test-tpu:py312
is_scheduled_run: ${{ github.event_name == 'schedule' }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
extra_pip_deps_file: ${{ matrix.pip_deps }}

maxtext_tpu_pathways_unit_tests:
needs: build_and_upload_maxtext_package
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_pathways_tests.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
with:
device_type: tpu
device_name: v6e-4
image_type: ${{ matrix.image_type }}
base_image: maxtext-unit-test-tpu:py312
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
Expand All @@ -208,12 +195,10 @@ jobs:
uses: ./.github/workflows/run_pathways_tests.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
with:
device_type: tpu
device_name: v6e-4
image_type: ${{ matrix.image_type }}
base_image: maxtext-unit-test-tpu:py312
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
xla_python_client_mem_fraction: 0.75
Expand All @@ -222,95 +207,9 @@ jobs:
is_scheduled_run: ${{ github.event_name == 'schedule' }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

maxtext_gpu_unit_tests:
needs: build_and_upload_maxtext_package
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_tests_against_package.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
cuda: ["cuda12"]
with:
device_type: ${{ matrix.cuda }}
device_name: a100-40gb-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-a2-48-a100-4gpu
pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training'
xla_python_client_mem_fraction: 0.65
tf_force_gpu_allow_growth: true
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

maxtext_post_training_cpu_unit_tests:
needs: build_and_upload_maxtext_package
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_tests_against_package.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
with:
device_type: cpu
device_name: X64
cloud_runner: linux-x86-n2-16
image_type: ${{ matrix.image_type }}
pytest_marker: 'cpu_only'
pytest_addopts: 'tests/post_training/unit'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

maxtext_post_training_tpu_unit_tests:
needs: build_and_upload_maxtext_package
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_tests_against_package.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
with:
device_type: tpu
device_name: v6e-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-ct6e-180-4tpu
pytest_marker: 'tpu_only'
pytest_addopts: 'tests/post_training/unit'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

maxtext_gpu_integration_tests:
needs: build_and_upload_maxtext_package
if: needs.doc_only_check.outputs.run_tests == 'true'
uses: ./.github/workflows/run_tests_against_package.yml
strategy:
fail-fast: false
matrix:
image_type: ["py312"]
cuda: ["cuda12"]
with:
device_type: ${{ matrix.cuda }}
device_name: a100-40gb-4
image_type: ${{ matrix.image_type }}
cloud_runner: linux-x86-a2-48-a100-4gpu
pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training'
xla_python_client_mem_fraction: 0.65
tf_force_gpu_allow_growth: true
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}

all_tests_passed:
name: All Required Tests Passed
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
if: always()
runs-on: ubuntu-latest
steps:
Expand All @@ -324,15 +223,11 @@ jobs:

# Otherwise, check that build and all tests passed or were skipped
echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"
echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"

# Fail only if any job failed or was cancelled (skipped is OK)
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
Expand All @@ -344,15 +239,11 @@ jobs:
env:
NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS: ${{ needs.doc_only_check.outputs.run_tests }}
NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT: ${{ needs.build_and_upload_maxtext_package.result }}
NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_cpu_unit_tests.result }}
NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_unit_tests.result }}
NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_integration_tests.result }}
NEEDS_CPU_TESTS_RESULT: ${{ needs.cpu-tests.result }}
NEEDS_TPU_TESTS_RESULT: ${{ needs.tpu-tests.result }}
NEEDS_GPU_TESTS_RESULT: ${{ needs.gpu-tests.result }}
NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }}
NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }}
NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }}

all_notebooks_passed:
name: All Notebooks Passed
Expand Down Expand Up @@ -385,14 +276,14 @@ jobs:

notify_failure:
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
if: ${{ always() }}
runs-on: ubuntu-latest
permissions:
issues: write
steps:
- name: Check whether one of the jobs failed
if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Check whether one of the jobs failed
if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
35 changes: 28 additions & 7 deletions .github/workflows/run_jupyter_notebooks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,20 @@ on:
device_name:
required: true
type: string
image_type:
required: false
base_image:
required: true
type: string
cloud_runner:
required: false
type: string
maxtext_sha:
required: true
required: false
type: string
# Flag to skip source checkout and wheel installation
maxtext_installed:
required: false
type: boolean
default: false
secrets:
HF_TOKEN:
required: true
Expand All @@ -44,17 +49,20 @@ jobs:
run:
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
container:
image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
steps:
- name: Checkout MaxText
if: ${{ !inputs.maxtext_installed }}
uses: actions/checkout@v5
with:
ref: ${{ inputs.maxtext_sha }}
- name: Download the MaxText wheel
if: ${{ !inputs.maxtext_installed }}
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
with:
name: maxtext-wheel
- name: Install MaxText and Dependencies
if: ${{ !inputs.maxtext_installed }}
shell: bash
run: |
# 1. Create virtual environment
Expand All @@ -65,21 +73,34 @@ jobs:
# 2. Install MaxText package and all the post training dependencies
uv pip install ${maxtext_wheel}[tpu-post-train] --resolution=lowest
install_maxtext_tpu_post_train_extra_deps
.venv/bin/python3 -m ipykernel install --user --name maxtext_venv

python3 -m pip freeze
- name: Run Post-Training Notebooks
shell: bash
env:
PYTHONPATH: "${{ github.workspace }}/src"
HF_TOKEN: ${{ secrets.HF_TOKEN }}
MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
run: |
source .venv/bin/activate
if [ "${MAXTEXT_INSTALLED}" == "true" ]; then
# Move to the directory where code is baked into the image. See the Dockerfile.
# This is necessary because GHA sets an empty workspace by default.
cd /deps
PYTHON_EXE="python3"
PAPERMILL_EXE="papermill"
else
PYTHON_EXE=".venv/bin/python3"
PAPERMILL_EXE=".venv/bin/papermill"
source .venv/bin/activate
fi

export MAXTEXT_REPO_ROOT=$(pwd)
export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
export MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples"

# Install dependencies for running notebooks
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

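This comment is wrong. The command below doesn't install dependencies; it registers the Python environment as a Jupyter kernel named maxtext_venv (the one papermill selects with -k), right?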

$PYTHON_EXE -m ipykernel install --user --name maxtext_venv

for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
filename=$(basename "$notebook")
if [[ "$filename" == "sft_qwen3_demo.ipynb" || "$filename" == "sft_llama3_demo_gpu.ipynb" ]]; then
Expand All @@ -92,7 +113,7 @@ jobs:
echo "Running $filename ..."
echo "------------------------------------------------------"

papermill "$notebook" "$output_name" -k maxtext_venv
$PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv
done
- name: Upload Outputs
if: always()
Expand Down
Loading
Loading