@@ -435,7 +435,9 @@ custom_mesh_and_rule: "" # replace default mesh and logical rule by specifying y
435435mesh_axes : ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']
436436logical_axis_rules : [
437437 ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
438+ ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
438439 ['activation_batch_no_exp', ['data', 'fsdp', 'fsdp_transpose']],
440+ ['activation_batch_no_exp_moe', ['data', 'fsdp', 'fsdp_transpose']],
439441 ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert']],
440442 ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
441443 ['activation_heads', ['tensor', 'tensor_transpose', 'sequence','tensor_sequence','autoregressive']],
@@ -448,14 +450,18 @@ logical_axis_rules: [
448450 ['activation_attn_length_no_exp', ['context']],
449451 ['activation_length_no_exp', ['sequence', 'context']],
450452 ['activation_length_no_exp', ['context']],
453+ ['activation_length_no_exp_moe', ['sequence', 'context']],
454+ ['activation_length_no_exp_moe', ['context']],
451455 ['activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
456+ ['activation_norm_length_moe', ['tensor_sequence', 'context', 'sequence']],
452457 ['activation_q_length', ['context', 'expert']],
453458 ['activation_q_length_no_exp', ['context']],
454459 ['prefill_activation_length', ['sequence', 'context']],
455460 ['prefill_activation_norm_length', ['tensor_sequence', 'context', 'sequence']],
456461 ['activation_kv_length', []],
457462 ['activation_attn_embed', ['tensor', 'tensor_transpose']],
458463 ['activation_embed', ['tensor', 'tensor_transpose']],
464+ ['activation_embed_moe', ['tensor', 'tensor_transpose']],
459465 ['activation_mlp', ['tensor', 'tensor_transpose', 'tensor_sequence']],
460466 ['activation_kv', ['tensor', 'tensor_transpose', 'tensor_sequence']],
461467 ['activation_prefill_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
@@ -484,6 +490,14 @@ logical_axis_rules: [
484490 ['embed_no_exp', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
485491 ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
486492 ['embed_no_exp', ['fsdp', 'sequence', 'context']],
493+ ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context', 'expert']],
494+ ['embed_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context', 'expert']],
495+ ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
496+ ['embed_moe', ['fsdp', 'sequence', 'context', 'expert']],
497+ ['embed_no_exp_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context']],
498+ ['embed_no_exp_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
499+ ['embed_no_exp_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
500+ ['embed_no_exp_moe', ['fsdp', 'sequence', 'context']],
487501 ['embed_tensor_transpose', ['tensor_transpose']],
488502 ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
489503 ['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
0 commit comments