Commit 1382bd3

fix: eliminate broadcast overhead before MoE ragged_all_to_all
Profiling revealed a severe performance bottleneck in every MoE layer, immediately before the ragged_all_to_all collective. The cause was the use of jnp.zeros to allocate the collective's output buffer, which forced XLA to broadcast a constant in order to zero-fill a large array on every call. This commit switches the buffer initialization to jax.lax.empty, which removes the broadcast overhead and improves step time.
1 parent 1e97f2e commit 1382bd3

1 file changed

Lines changed: 2 additions & 2 deletions


src/maxtext/layers/moe.py

@@ -1168,7 +1168,7 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
       # experts_per_shard > num_experts_per_tok we cannot assign more than num_experts_per_tok to all of the inputs.
       max_local_experts_per_tok = min(local_expert_size, self.config.num_experts_per_tok)
       buffer_size = int(num_expert_parallelism * batch_size * sequence_length * max_local_experts_per_tok)
-      output_shape = jnp.zeros((buffer_size, self.config.emb_dim), dtype=x.dtype)
+      output_shape = jax.lax.empty((buffer_size, self.config.emb_dim), dtype=x.dtype)

       x = jax.lax.ragged_all_to_all(
           x,
@@ -1331,7 +1331,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
     original_inputs_first_dim = batch_size * sequence_length * self.config.num_experts_per_tok
     if sorted_selected_experts.shape[0] != original_inputs_first_dim:
       raise ValueError("original_inputs_first_dim does not match the original tensor" " shape!")
-    output_shape = jnp.zeros(
+    output_shape = jax.lax.empty(
         (
             original_inputs_first_dim,
             self.config.emb_dim // self.get_tensor_parallelism_size(),
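For context, here is a minimal standalone sketch of the two buffer-initialization strategies. The sizes, the input array x, and the variable names are illustrative only; the jax.lax.empty call mirrors the one introduced by this commit and assumes a JAX version that exposes it.

    import jax
    import jax.numpy as jnp

    buffer_size, emb_dim = 8, 4  # illustrative sizes, not the model's real dimensions
    x = jnp.ones((buffer_size, emb_dim), dtype=jnp.bfloat16)

    # Before: zero-initializing the output buffer makes XLA broadcast a constant
    # across the full (buffer_size, emb_dim) shape before the collective runs.
    output_buffer = jnp.zeros((buffer_size, emb_dim), dtype=x.dtype)

    # After: allocating the buffer uninitialized skips that broadcast; the
    # ragged_all_to_all collective overwrites the buffer, so its initial
    # values are never needed.
    output_buffer = jax.lax.empty((buffer_size, emb_dim), dtype=x.dtype)

This is safe as long as any buffer elements the collective does not write are never read downstream; a buffer whose initial contents could leak into the result would still need an explicit initialization.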
