
Commit 0798438

Merge branch 'main' of github.com:AI-Hypercomputer/maxtext into shuningjin-ckpt-opt3

2 parents: 10bef5d + 9f6b09a

83 files changed: 1,918 additions & 1,068 deletions


.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
```diff
@@ -22,7 +22,7 @@ tests/inference/ @vipannalla @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @p
 src/maxtext/inference @vipannalla @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @patemotter @lumosis @richjames0
 
 # Dockerfiles and dependencies
-src/dependencies/ @bvandermoon @parambole @richjames0 @shralex
+src/dependencies/ @bvandermoon @SurbhiJainUSC @parambole @richjames0 @shralex
 
 # Docs
 docs/ @jacoguzo @bvandermoon @richjames0 @shralex @gobbleturk @RissyRan @gagika @A9isha @jiangjy1982 @vipannalla
```

.github/workflows/build_and_push_docker_image.yml

Lines changed: 2 additions & 19 deletions
```diff
@@ -54,6 +54,7 @@ jobs:
     runs-on: linux-x86-n2-16-buildkit
     container: google/cloud-sdk:524.0.0
     if: >
+      github.event_name == 'release' ||
       github.event_name == 'schedule' ||
       github.event_name == 'pull_request' ||
       github.event_name == 'workflow_dispatch' && (
@@ -86,15 +87,8 @@ jobs:
           # This ensures that every job clones the exact same commit as "setup" job
           ref: ${{ inputs.maxtext_sha }}
 
-      - name: Checkout post-training dependencies
-        if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
-        run: |
-          git clone https://github.com/google/tunix.git ./tunix
-          git clone https://github.com/vllm-project/vllm.git ./vllm
-          git clone https://github.com/vllm-project/tpu-inference.git ./tpu-inference
-
       - name: Mark git repositories as safe
-        run: git config --global --add safe.directory '*'
+        run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
         if: steps.check.outputs.should_run == 'true'
 
       - name: Configure Docker
@@ -123,7 +117,6 @@ jobs:
            MODE=${{ inputs.build_mode }}
            WORKFLOW=${{ inputs.workflow }}
            PACKAGE_DIR=./src
-           TESTS_DIR=./tests
            JAX_VERSION=NONE
            LIBTPU_VERSION=NONE
            INCLUDE_TEST_ASSETS=true
@@ -149,16 +142,6 @@ jobs:
            # Add MaxText tag
            maxtext_hash=$(git rev-parse --short HEAD)
            gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
-
-           # Add post-training dependencies tags
-           if [ "${{ inputs.workflow }}" == "post-training" ]; then
-             for dir in tunix vllm tpu-inference; do
-               if [ -d "./$dir" ]; then
-                 dir_hash=$(git -C "$dir" rev-parse --short HEAD)
-                 gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
-               fi
-             done
-           fi
          fi
        env:
          INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
```

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -52,7 +52,6 @@ repos:
       args:
         - '--pyink-indentation=2'
         - '--line-length=122'
-        - '--check'
 
   - repo: https://github.com/executablebooks/mdformat
     rev: 0.7.22
```
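Dropping `--check` changes the pyink hook's behavior: with the flag, the hook only reported formatting violations and failed; without it, pyink rewrites non-conforming files in place when the hook runs. A minimal usage sketch with the standard pre-commit CLI (nothing here is MaxText-specific):

```sh
# Run every configured hook against the whole repository.
# With --check removed, pyink now fixes files instead of only failing,
# so a second run after staging the rewrites should come back clean.
pre-commit run --all-files
```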

LICENSE_HEADER

Lines changed: 13 additions & 0 deletions
```diff
@@ -0,0 +1,13 @@
+Copyright 2023–2026 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
```

benchmarks/maxtext_xpk_runner.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -428,7 +428,7 @@ def build_user_command(
   if wl_config.hlo_dump:
     hlo_dump = "XLA_FLAGS='--xla_dump_large_constants --xla_dump_to=/tmp/xla_dump'"
     upload_hlo_dump = (
-        f" && gsutil -m cp -r /tmp/xla_dump {wl_config.base_output_directory}/{wl_config.run_name}/hlo_dump"
+        f" && gcloud storage cp -r /tmp/xla_dump {wl_config.base_output_directory}/{wl_config.run_name}/hlo_dump"
     )
   # Construct the command string with proper formatting and line continuations
   command = " ".join(
```

benchmarks/upload_metrics_to_bq.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -187,7 +187,7 @@ def add_parser_arguments(parser: argparse.ArgumentParser):
 
 
 def download_metrics_file_locally(metrics_gcs_file: str, local_file: str) -> int:
-  command = f"gsutil cp -r {metrics_gcs_file} {local_file}"
+  command = f"gcloud storage cp --recursive {metrics_gcs_file} {local_file}"
   return run_command_with_updates(command, f"Download {metrics_gcs_file} in {local_file}")
 
 
```
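Both benchmark changes in this commit swap the legacy `gsutil` CLI for `gcloud storage`. A rough before/after sketch of the mapping, using a hypothetical bucket path (note that `gcloud storage cp` parallelizes transfers by default, which is why the old `-m` flag simply disappears; `-r` and `--recursive` are interchangeable):

```sh
# Before: gsutil, with -m enabling parallel (multi-threaded) copies
gsutil -m cp -r /tmp/xla_dump gs://my-bucket/my-run/hlo_dump

# After: gcloud storage parallelizes by default, so no -m equivalent is needed
gcloud storage cp --recursive /tmp/xla_dump gs://my-bucket/my-run/hlo_dump
```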

docs/guides/data_input_pipeline.md

Lines changed: 20 additions & 13 deletions
````diff
@@ -15,29 +15,34 @@
 -->
 
 (data-input-pipeline)=
+
 # Data pipelines
 
 Currently MaxText has three data input pipelines:
 
-| Pipeline | Dataset formats | Features | Limitations |
-| -------- | --------------- | -------- | ----------- |
-| **[Grain](data_input_pipeline/data_input_grain.md)** (recommended)| [ArrayRecord](https://github.com/google/array_record) (random access, available through [Tensorflow Datasets](https://www.tensorflow.org/datasets/catalog/overview), or [conversion](https://github.com/google/array_record/tree/main/beam))<br>[Parquet](https://arrow.apache.org/docs/python/parquet.html) (sequential access) | With arrayrecord: fully deterministic, resilient to preemption; global shuffle <br>With parquet: performant; fully deterministic, resilient to preemption; hierarchical shuffle | |
-| **[Hugging Face](data_input_pipeline/data_input_hf.md)** | datasets in [Hugging Face Hub](https://huggingface.co/datasets)<br>local/Cloud Storage datasets in json, parquet, arrow, csv, txt (sequential access) | no download needed, convenience; <br>multiple formats | limit scalability using the Hugging Face Hub (no limit using Cloud Storage); <br>non-deterministic with preemption<br>(deterministic without preemption)<br> |
-| **[TFDS](data_input_pipeline/data_input_tfds.md)** | TFRecord (sequential access), available through [Tensorflow Datasets](https://www.tensorflow.org/datasets/catalog/overview) | performant | only supports TFRecords; <br>non-deterministic with preemption<br>(deterministic without preemption) |
+| Pipeline                                                            | Dataset formats                                                                                                                                                                                                                                                                                                            | Features                                                                                                                                                                         | Limitations                                                                                                                                                   |
+| ------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **[Grain](data_input_pipeline/data_input_grain.md)** (recommended) | [ArrayRecord](https://github.com/google/array_record) (random access, available through [Tensorflow Datasets](https://www.tensorflow.org/datasets/catalog/overview), or [conversion](https://github.com/google/array_record/tree/main/beam))<br>[TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) (sequential access, available through [Tensorflow Datasets](https://www.tensorflow.org/datasets/catalog/overview))<br>[Parquet](https://arrow.apache.org/docs/python/parquet.html) (sequential access) | With arrayrecord: fully deterministic, resilient to preemption; global shuffle <br>With parquet: performant; fully deterministic, resilient to preemption; hierarchical shuffle | |
+| **[Hugging Face](data_input_pipeline/data_input_hf.md)** | datasets in [Hugging Face Hub](https://huggingface.co/datasets)<br>local/Cloud Storage datasets in json, parquet, arrow, csv, txt (sequential access) | no download needed, convenience; <br>multiple formats | limit scalability using the Hugging Face Hub (no limit using Cloud Storage); <br>non-deterministic with preemption<br>(deterministic without preemption)<br> |
+| **[TFDS](data_input_pipeline/data_input_tfds.md)** | TFRecord (sequential access), available through [Tensorflow Datasets](https://www.tensorflow.org/datasets/catalog/overview) | performant | only supports TFRecords; <br>non-deterministic with preemption<br>(deterministic without preemption) |
 
 ## Multihost dataloading best practice
+
 Training in a multi-host environment presents unique challenges for data input pipelines. An effective data loading strategy must address three key issues:
+
 1. **Concurrent access**: Multiple hosts need to read from the same dataset simultaneously without causing conflicts.
 2. **Data uniqueness**: Each host must be fed a unique, non-overlapping subset of the data to ensure the model sees each example correctly.
-3. **Uneven completion**: Handling the scenario where some hosts run out of data before others, which can lead to hanging.
-The approaches to solve these challenges depend on whether your dataset supports random access or is limited to sequential access.
+3. **Uneven completion**: Handling the scenario where some hosts run out of data before others, which can lead to hanging.
+   The approaches to solve these challenges depend on whether your dataset supports random access or is limited to sequential access.
 
 ### Random access dataset (Recommended)
+
 Random-access formats are highly recommended for multi-host training because they allow any part of the file to be read directly by its index.<br>
 In MaxText, this is best supported by the ArrayRecord format using the Grain input pipeline. This approach gracefully handles the key challenges:
-* **Concurrent access and uniqueness**: Grain assigns a unique set of indices to each host. ArrayRecord allows different hosts to read from different indices in the same file.
 
-* **Uneven completion**: Data indices are distributed evenly among hosts. Without packing, the data imbalance between hosts will be at most one batch. To handle the final steps where some hosts run out of data, you can enable the `generate_padding_batch_train`/`generate_padding_batch_eval` flag in `src/MaxText/config/base.yml` or through command line arguments. This directs hosts to generate empty "padding" batches until the training or evaluation steps are met.
+- **Concurrent access and uniqueness**: Grain assigns a unique set of indices to each host. ArrayRecord allows different hosts to read from different indices in the same file.
+
+- **Uneven completion**: Data indices are distributed evenly among hosts. Without packing, the data imbalance between hosts will be at most one batch. To handle the final steps where some hosts run out of data, you can enable the `generate_padding_batch_train`/`generate_padding_batch_eval` flag in `src/MaxText/config/base.yml` or through command line arguments. This directs hosts to generate empty "padding" batches until the training or evaluation steps are met.
 
 ```{note}
 When sequence packing is enabled, the difference in the number of packed examples per host can be larger. The `generate_padding_batch_train`/`generate_padding_batch_eval` flag still solves this.
@@ -48,12 +53,14 @@ If all hosts exhaust their data before the target step count is reached, both `t
 ```
 
 ### Sequential access dataset
-* **Concurrent access and uniqueness**: Sequential-access datasets (e.g., Parquet, JSON, TFRecord) cannot be accessed by index, requiring a different strategy -- file-based sharding, where each host is given exclusive access to a specific subset of data files. **Key requirement**: `(Number of data files) % (Number of data-loading hosts) == 0`. If the file count isn't a multiple of the host count, the files will be distributed unevenly. For example, with 10 files and 8 hosts, some hosts will get two files while others get one, significantly worsening the "uneven completion" problem. If you have fewer files than hosts, performance will be severely degraded as all hosts are concurrently accessing all the files.
-* **Uneven completion**: Similar to random-access datasets, you can use the `generate_padding_batch_train`/`generate_padding_batch_eval` flag to handle hosts that finish their file shards early.
 
-```{toctree}
-:hidden:
+- **Concurrent access and uniqueness**: Sequential-access datasets (e.g., Parquet, JSON, TFRecord) cannot be accessed by index, requiring a different strategy -- file-based sharding, where each host is given exclusive access to a specific subset of data files. **Key requirement**: `(Number of data files) % (Number of data-loading hosts) == 0`. If the file count isn't a multiple of the host count, the files will be distributed unevenly. For example, with 10 files and 8 hosts, some hosts will get two files while others get one, significantly worsening the "uneven completion" problem. If you have fewer files than hosts, performance will be severely degraded as all hosts are concurrently accessing all the files.
+- **Uneven completion**: Similar to random-access datasets, you can use the `generate_padding_batch_train`/`generate_padding_batch_eval` flag to handle hosts that finish their file shards early.
 
+```{toctree}
+---
+hidden:
+---
 data_input_pipeline/data_input_grain
 data_input_pipeline/data_input_hf
 data_input_pipeline/data_input_tfds
````
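The sequential-access sharding rule in the doc text above is easy to see with a toy example. Below is a minimal, hypothetical Python sketch (not MaxText code; `shard_files` is an illustration only) of round-robin file sharding, showing why `(number of files) % (number of hosts) == 0` keeps per-host shards balanced:

```python
def shard_files(files: list[str], host_index: int, host_count: int) -> list[str]:
  """Round-robin assignment: each host gets an exclusive subset of files."""
  if len(files) % host_count != 0:
    # Uneven shards: some hosts own more files, finish later, and the
    # "uneven completion" problem described above gets worse.
    print(f"warning: {len(files)} files do not divide evenly over {host_count} hosts")
  return files[host_index::host_count]


# 8 files over 4 hosts -> every host owns exactly 2 files.
files = [f"gs://my-bucket/shard-{i:05d}.parquet" for i in range(8)]
for host in range(4):
  print(host, shard_files(files, host, 4))
```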

docs/guides/data_input_pipeline/data_input_grain.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -32,9 +32,9 @@ Grain ensures determinism in data input pipelines by saving the pipeline's state
 
 ## Using Grain
 
-1. Grain currently supports two data formats: [ArrayRecord](https://github.com/google/array_record) (random access) and [Parquet](https://arrow.apache.org/docs/python/parquet.html) (partial random-access through row groups). Only the ArrayRecord format supports the global shuffle mentioned above. For converting a dataset into ArrayRecord, see [Apache Beam Integration for ArrayRecord](https://github.com/google/array_record/tree/main/beam). Additionally, other random access data sources can be supported via a custom [data source](https://google-grain.readthedocs.io/en/latest/data_sources.html) class.
+1. Grain currently supports three data formats: [ArrayRecord](https://github.com/google/array_record) (random access), [Parquet](https://arrow.apache.org/docs/python/parquet.html) (partial random-access through row groups), and [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) (sequential access). Only the ArrayRecord format supports the global shuffle mentioned above. For converting a dataset into ArrayRecord, see [Apache Beam Integration for ArrayRecord](https://github.com/google/array_record/tree/main/beam). Additionally, other random access data sources can be supported via a custom [data source](https://google-grain.readthedocs.io/en/latest/data_sources.html) class.
    - **Community Resource**: The MaxText community has created a [ArrayRecord Documentation](https://array-record.readthedocs.io/). Note: we appreciate the contribution from the community, but as of now it has not been verified by the MaxText or ArrayRecord developers yet.
-2. When the dataset is hosted on a Cloud Storage bucket, Grain can read it through [Cloud Storage FUSE](https://cloud.google.com/storage/docs/gcs-fuse). The installation of Cloud Storage FUSE is included in [setup.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/setup.sh). The user then needs to mount the Cloud Storage bucket to a local path for each worker, using the script [setup_gcsfuse.sh](https://github.com/google/maxtext/blob/main/tools/setup/setup_gcsfuse.sh). The script configures some parameters for the mount.
+2. If the dataset is hosted on a Cloud Storage bucket, a `gs://` path can be provided directly. However, for the best performance, it is recommended to read the bucket through [Cloud Storage FUSE](https://cloud.google.com/storage/docs/gcs-fuse). This significantly improves performance for the ArrayRecord format, since metadata caching speeds up random access. The installation of Cloud Storage FUSE is included in [setup.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/setup.sh). The user then needs to mount the Cloud Storage bucket to a local path for each worker, using the script [setup_gcsfuse.sh](https://github.com/google/maxtext/blob/main/tools/setup/setup_gcsfuse.sh). The script configures some parameters for the mount.
 
 ```sh
 bash tools/setup/setup_gcsfuse.sh \
@@ -45,7 +45,7 @@ MOUNT_PATH=${MOUNT_PATH?} \
 
 Note that `FILE_PATH` is optional; when provided, the script runs `ls -R` for pre-filling the metadata cache (see ["Performance tuning best practices" on the Google Cloud documentation](https://cloud.google.com/storage/docs/cloud-storage-fuse/performance#improve-first-time-reads)).
 
-1. Set `dataset_type=grain`, `grain_file_type={arrayrecord|parquet}`, `grain_train_files` in `src/maxtext/configs/base.yml` or through command line arguments to match the file pattern on the mounted local path.
+1. Set `dataset_type=grain`, `grain_file_type={arrayrecord|parquet|tfrecord}`, `grain_train_files` in `src/maxtext/configs/base.yml` or through command line arguments to match the file pattern on the mounted local path.
 
 2. Tune `grain_worker_count` for performance. This parameter controls the number of child processes used by Grain (more details in [behind_the_scenes](https://google-grain.readthedocs.io/en/latest/behind_the_scenes.html), [grain_pool.py](https://github.com/google/grain/blob/main/grain/_src/python/grain_pool.py)). If you use a large number of workers, check your config for gcsfuse in [setup_gcsfuse.sh](https://github.com/google/maxtext/blob/main/tools/setup/setup_gcsfuse.sh) to avoid gcsfuse throttling.
@@ -112,7 +112,7 @@ Note that `FILE_PATH` is optional; when provided, the script runs `ls -R` for pr
 bash tools/setup/setup_gcsfuse.sh \
 DATASET_GCS_BUCKET=maxtext-dataset \
 MOUNT_PATH=/tmp/gcsfuse && \
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
+python3 -m maxtext.trainers.pre_train.train \
 run_name=<RUN_NAME> base_output_directory=gs://<MY_BUCKET> \
 dataset_type=grain \
 grain_file_type=arrayrecord # or parquet \
````
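The updated doc text keeps pointing at Grain's custom data source mechanism for formats it ships no reader for. A minimal sketch of that idea, assuming the protocol described in the Grain documentation (any object exposing `__len__` and `__getitem__` can back a `MapDataset`); the `InMemorySource` class and its records are hypothetical:

```python
import grain.python as grain


class InMemorySource:
  """Hypothetical random-access source serving records from a Python list."""

  def __init__(self, records):
    self._records = records

  def __len__(self) -> int:
    return len(self._records)

  def __getitem__(self, index: int):
    return self._records[index]


# Index-based access is what lets Grain shard deterministically across
# hosts and perform a global shuffle, as with ArrayRecord files.
source = InMemorySource([{"text": f"example {i}"} for i in range(100)])
dataset = grain.MapDataset.source(source).shuffle(seed=0).batch(8)
```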
