|
14 | 14 | }, |
15 | 15 | { |
16 | 16 | "attention_mla/inputs_q: bfloat16[768,2048,2048]": { |
17 | | - "logic_axes": "('activation_batch', 'activation_length_no_exp', 'activation_embed')", |
| 17 | + "logic_axes": "('activation_batch', 'activation_length', 'activation_embed')", |
18 | 18 | "PartitionSpec": "P(('data', 'fsdp'), None, None)" |
19 | 19 | } |
20 | 20 | }, |
21 | 21 | { |
22 | 22 | "attention_mla/inputs_kv: bfloat16[768,2048,2048]": { |
23 | | - "logic_axes": "('activation_batch', 'activation_length_no_exp', 'activation_embed')", |
| 23 | + "logic_axes": "('activation_batch', 'activation_length', 'activation_embed')", |
24 | 24 | "PartitionSpec": "P(('data', 'fsdp'), None, None)" |
25 | 25 | } |
26 | 26 | }, |
27 | 27 | { |
28 | 28 | "attention_mla/q_nope: bfloat16[768,2048,16,128]": { |
29 | | - "logic_axes": "('activation_kv_batch', 'activation_length_no_exp', 'activation_kv_heads', 'activation_kv_head_dim')", |
| 29 | + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", |
30 | 30 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
31 | 31 | } |
32 | 32 | }, |
33 | 33 | { |
34 | 34 | "attention_mla/q_pe: bfloat16[768,2048,16,64]": { |
35 | | - "logic_axes": "('activation_kv_batch', 'activation_length_no_exp', 'activation_kv_heads', 'activation_kv_head_dim')", |
| 35 | + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", |
36 | 36 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
37 | 37 | } |
38 | 38 | }, |
39 | 39 | { |
40 | 40 | "attention_mla/query: bfloat16[768,2048,16,192]": { |
41 | | - "logic_axes": "('activation_kv_batch', 'activation_length_no_exp', 'activation_kv_heads', 'activation_kv_head_dim')", |
| 41 | + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", |
42 | 42 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
43 | 43 | } |
44 | 44 | }, |
45 | 45 | { |
46 | 46 | "attention_mla/key_nope: bfloat16[768,2048,16,128]": { |
47 | | - "logic_axes": "('activation_kv_batch', 'activation_length_no_exp', 'activation_kv_heads', 'activation_kv_head_dim')", |
| 47 | + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", |
48 | 48 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
49 | 49 | } |
50 | 50 | }, |
51 | 51 | { |
52 | 52 | "attention_mla/key_rope: bfloat16[768,2048,16,64]": { |
53 | | - "logic_axes": "('activation_kv_batch', 'activation_length_no_exp', 'activation_kv_heads', 'activation_kv_head_dim')", |
| 53 | + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", |
54 | 54 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
55 | 55 | } |
56 | 56 | }, |
57 | 57 | { |
58 | 58 | "attention_mla/key: bfloat16[768,2048,16,192]": { |
59 | | - "logic_axes": "('activation_kv_batch', 'activation_length_no_exp', 'activation_kv_heads', 'activation_kv_head_dim')", |
| 59 | + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", |
60 | 60 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
61 | 61 | } |
62 | 62 | }, |
63 | 63 | { |
64 | 64 | "attention_mla/value: bfloat16[768,2048,16,128]": { |
65 | | - "logic_axes": "('activation_kv_batch', 'activation_length_no_exp', 'activation_kv_heads', 'activation_kv_head_dim')", |
| 65 | + "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')", |
66 | 66 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
67 | 67 | } |
68 | 68 | }, |
|
86 | 86 | }, |
87 | 87 | { |
88 | 88 | "attention_mla/out: bfloat16[768,2048,16,128]": { |
89 | | - "logic_axes": "('activation_batch', 'activation_length_no_exp', 'activation_heads', 'activation_kv')", |
| 89 | + "logic_axes": "('activation_batch', 'activation_length', 'activation_heads', 'activation_kv')", |
90 | 90 | "PartitionSpec": "P(('data', 'fsdp'), None, None, None)" |
91 | 91 | } |
92 | 92 | }, |
|
104 | 104 | }, |
105 | 105 | { |
106 | 106 | "linears/x: bfloat16[768,2048,10944]": { |
107 | | - "logic_axes": "('activation_batch', 'activation_length_no_exp', 'activation_mlp')", |
| 107 | + "logic_axes": "('activation_batch', 'activation_length', 'activation_mlp')", |
108 | 108 | "PartitionSpec": "P(('data', 'fsdp'), None, None)" |
109 | 109 | } |
110 | 110 | }, |
|
134 | 134 | }, |
135 | 135 | { |
136 | 136 | "linears/x: bfloat16[768,2048,2816]": { |
137 | | - "logic_axes": "('activation_batch', 'activation_length_no_exp', 'activation_mlp')", |
| 137 | + "logic_axes": "('activation_batch', 'activation_length', 'activation_mlp')", |
138 | 138 | "PartitionSpec": "P(('data', 'fsdp'), None, None)" |
139 | 139 | } |
140 | 140 | }, |
|
0 commit comments