Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
90619c5
Reapply "Attention bug fixes, tokamax splash defaulting logic (#282)"…
Dec 15, 2025
d848983
Reapply "Cross self attention switch (#251)" (#288)
Dec 15, 2025
c29fdc4
Disable unsafe rng
Dec 15, 2025
f68c7b0
Integrate tokamax ring attention as optional attention kernel for WAN…
Dec 17, 2025
8a18686
Merge branch 'main' into elisatsai_disable_unsafe_rng
eltsai Dec 29, 2025
a7fa4f0
Fixed formatting issue
Dec 30, 2025
41d9353
Updated scheduler test values
Dec 30, 2025
d128e32
Updated values based on v5p-8 tests
Dec 30, 2025
70ce989
Fixing ring attention
Jan 5, 2026
ed47e5f
moving kernel init outside the sharding map
Feb 10, 2026
65e7f93
Revert "moving kernel init outside the sharding map"
Feb 15, 2026
a0c377f
jitting and sharding vae, refactored for loops in jitted VAE, 132 sec…
Feb 23, 2026
e7cd3c4
Renaming VAE sharding axis to vae_spatial
Feb 26, 2026
c236d56
Renaming VAE sharding axis to vae_spatial
Feb 26, 2026
9bcd458
ring-attention
coolkp Mar 2, 2026
0e60bbb
Merge remote-tracking branch 'origin/kunjanp-ring-attention' into eli…
Mar 4, 2026
10f2f33
Merge remote-tracking branch 'origin/main' into elisatsai_ring_attention
Mar 4, 2026
ffd7933
fixing attention from merging main
Mar 5, 2026
62e3b06
Fix attention_flax API regression from manual edits regarding context…
Mar 5, 2026
0a7d593
Merge branch 'elisatsai_ring_attention' of https://github.com/AI-Hype…
Mar 5, 2026
115fffa
Added sharding on ROPE
Mar 10, 2026
e04e78d
cfg cache
Mar 9, 2026
5b91824
Merged CFG cache, 220 sec using tokamax_flash
Mar 11, 2026
2d4eae1
Changed profiling logic
Mar 12, 2026
438fefd
Format fix
Mar 16, 2026
dff5c30
Merge remote-tracking branch 'origin/main' into elisatsai_ring_attention
Mar 16, 2026
7293017
updated vae config logic to be consistent, updated xprof logic
Mar 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/maxdiffusion/configs/base_wan_14b.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ activations_dtype: 'bfloat16'

# Replicates vae across devices instead of using the model's sharding annotations for sharding.
replicate_vae: False
vae_spatial: -1 # default to total_device * 2 // (dp)

# matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
# Options are "DEFAULT", "HIGH", "HIGHEST"
Expand All @@ -60,7 +61,7 @@ jit_initializers: True
# Set true to load weights from pytorch
from_pt: True
split_head_dim: True
attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring
attention: 'tokamax_flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring
flash_min_seq_length: 0

# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
Expand All @@ -81,9 +82,7 @@ flash_block_sizes: {
"block_q_dkv" : 512,
"block_kv_dkv" : 512,
"block_kv_dkv_compute" : 512,
"block_q_dq" : 512,
"block_kv_dq" : 512,
"use_fused_bwd_kernel": False,
"use_fused_bwd_kernel": True
}
# Use on v6e
# flash_block_sizes: {
Expand Down
1 change: 1 addition & 0 deletions src/maxdiffusion/configs/base_wan_27b.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ activations_dtype: 'bfloat16'

# Replicates vae across devices instead of using the model's sharding annotations for sharding.
replicate_vae: False
vae_spatial: -1

# matmul and conv precision from https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision
# Options are "DEFAULT", "HIGH", "HIGHEST"
Expand Down
60 changes: 53 additions & 7 deletions src/maxdiffusion/generate_wan.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,20 @@ def get_git_commit_hash():
jax.config.update("jax_use_shardy_partitioner", True)


def call_pipeline(config, pipeline, prompt, negative_prompt):
def call_pipeline(config, pipeline, prompt, negative_prompt, num_inference_steps=None):
"""Call the pipeline with optional num_inference_steps override.

Args:
config: The configuration object.
pipeline: The pipeline to call.
prompt: The prompt(s) to use.
negative_prompt: The negative prompt(s) to use.
num_inference_steps: Optional override for number of inference steps.
If None, uses config.num_inference_steps.
"""
model_key = config.model_name
model_type = config.model_type
steps = num_inference_steps if num_inference_steps is not None else config.num_inference_steps
if model_type == "I2V":
image = load_image(config.image_url)
if model_key == WAN2_1:
Expand All @@ -98,7 +109,7 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
height=config.height,
width=config.width,
num_frames=config.num_frames,
num_inference_steps=config.num_inference_steps,
num_inference_steps=steps,
guidance_scale=config.guidance_scale,
)
elif model_key == WAN2_2:
Expand All @@ -109,7 +120,7 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
height=config.height,
width=config.width,
num_frames=config.num_frames,
num_inference_steps=config.num_inference_steps,
num_inference_steps=steps,
guidance_scale_low=config.guidance_scale_low,
guidance_scale_high=config.guidance_scale_high,
use_cfg_cache=config.use_cfg_cache,
Expand All @@ -124,7 +135,7 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
height=config.height,
width=config.width,
num_frames=config.num_frames,
num_inference_steps=config.num_inference_steps,
num_inference_steps=steps,
guidance_scale=config.guidance_scale,
use_cfg_cache=config.use_cfg_cache,
)
Expand All @@ -135,7 +146,7 @@ def call_pipeline(config, pipeline, prompt, negative_prompt):
height=config.height,
width=config.width,
num_frames=config.num_frames,
num_inference_steps=config.num_inference_steps,
num_inference_steps=steps,
guidance_scale_low=config.guidance_scale_low,
guidance_scale_high=config.guidance_scale_high,
use_cfg_cache=config.use_cfg_cache,
Expand Down Expand Up @@ -248,6 +259,7 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
max_logging.log(f"hardware: {jax.devices()[0].platform}")
max_logging.log(f"number of devices: {jax.device_count()}")
max_logging.log(f"per_device_batch_size: {config.per_device_batch_size}")
max_logging.log(f"vae_spatial: {config.vae_spatial}")
max_logging.log("============================================================")

compile_time = time.perf_counter() - s0
Expand Down Expand Up @@ -276,15 +288,49 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
max_logging.log(f"generation time per video: {generation_time_per_video}")
else:
max_logging.log("Warning: Number of videos is zero, cannot calculate generation_time_per_video.")
s0 = time.perf_counter()

if config.enable_profiler:
skip_steps = getattr(config, 'skip_first_n_steps_for_profiler', 0)
profiler_steps = getattr(config, 'profiler_steps', config.num_inference_steps)
profile_all = profiler_steps == -1
steps_for_profile = config.num_inference_steps if profile_all else profiler_steps

if profile_all:
max_logging.log(f"Profiler: profiling all {steps_for_profile} inference steps (profiler_steps=-1)")
else:
max_logging.log(f"Profiler: profiling {steps_for_profile} steps out of {config.num_inference_steps} total")
max_logging.log(f"Profiler: skip_first_n_steps={skip_steps}")

def block_if_jax(x):
"""Block until ready if x is a JAX array, otherwise no-op."""
if hasattr(x, 'block_until_ready'):
x.block_until_ready()
return x

for i in range(skip_steps):
max_logging.log(f"Profiler warmup iteration {i + 1}/{skip_steps}")
warmup_videos = call_pipeline(config, pipeline, prompt, negative_prompt, num_inference_steps=steps_for_profile)
# Block until warmup completes
jax.tree_util.tree_map(block_if_jax, warmup_videos)

# Warm up GCS connection by flushing writer before starting profiler
if writer and jax.process_index() == 0:
max_logging.log("Flushing writer to warm up GCS connection before profiler...")
writer.flush()

s0 = time.perf_counter()
max_utils.activate_profiler(config)
videos = call_pipeline(config, pipeline, prompt, negative_prompt)
max_logging.log(f"Profiler: starting profiled run with {steps_for_profile} steps")
profiled_videos = call_pipeline(config, pipeline, prompt, negative_prompt, num_inference_steps=steps_for_profile)
# Wait for all computation to finish before stopping profiler
jax.tree_util.tree_map(block_if_jax, profiled_videos)
max_utils.deactivate_profiler(config)
max_utils.upload_profiler_traces(config)
generation_time_with_profiler = time.perf_counter() - s0
max_logging.log(f"generation_time_with_profiler: {generation_time_with_profiler}")
if writer and jax.process_index() == 0:
writer.add_scalar("inference/generation_time_with_profiler", generation_time_with_profiler, global_step=0)
max_logging.log("Profiler: completed (video not saved)")

return saved_video_path

Expand Down
Empty file.
Loading
Loading