Skip to content

Commit e411e3d

Browse files
unamedkr authored
and claude committed
pillar1.5(R3) ★: NEOX-ordering RoPE for pure Qwen3 + Qwen3.6 batched
Root cause of long-prompt UTF-8 garbage on Qwen3 family: RoPE ordering mismatch with HF reference. - llama.cpp: LLM_ARCH_QWEN3 / QWEN3MOE → LLAMA_ROPE_TYPE_NEOX (half-split pairs: q[i] pairs with q[i+head_dim/2]) - Our engine's tq_rope and batched-prefill RoPE both used LLaMA-style interleaved pairs (q[2i], q[2i+1]) - R34 fixed this ONLY for the partial-rotary path (Qwen3.5/3.6 hybrid); pure Qwen3 (full rotary) and tq_forward_batch were never converted. Changes: 1. New tq_rope_neox() in tq_ops.c — half-split variant with same TLS sin/cos caching as tq_rope. 2. tq_engine.h exports the new entry point. 3. Per-token self_attn_forward full-rotary else branch: detect Qwen3 via gguf arch string or delta_n_heads > 0, dispatch to tq_rope_neox. TQ_ROPE_PAIRS=1 opt-out for legacy Qwen2/LLaMA. 4. tq_forward_batch full-rotary RoPE path: same detection and half-split rotation for learned-freq (rope_freqs) and fallback (tq_rope_neox) branches. Real-world (Qwen3-0.6B Q4, 50-word synthetic input): BEFORE: "lenameuously...catchØ�Williamson" UTF-8 garbage AFTER (batched): " Let me try to understand this" AFTER (per-token): " ... and so on... etc. So, the problem is..." Pillar 1 R3 BPE fix was necessary (tokens correct) but not sufficient (RoPE still wrong). Together, pure Qwen3 + Qwen3.5-4B long-prompt coherence restored. Qwen3.6-35B short prompts remain coherent; long-prompt partial coherence needs further investigation (likely DeltaNet-specific accumulation issue separate from RoPE). Regression: 15/15 test_models + 4/4 test_tokenizer PASS. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b6b5f09 commit e411e3d

3 files changed

Lines changed: 117 additions & 12 deletions

File tree

include/turboquant/tq_engine.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,8 @@ void tq_matmul_rht_q4q2(float* out, const float* x,
652652
void tq_rmsnorm(float* out, const float* x, const float* weight, int n, float eps);
653653
void tq_rope(float* q, float* k, int pos, int head_dim,
654654
int n_heads, int n_kv_heads, float freq_base);
655+
void tq_rope_neox(float* q, float* k, int pos, int head_dim,
656+
int n_heads, int n_kv_heads, float freq_base);
655657
void tq_silu(float* x, int n);
656658
void tq_gelu_tanh(float* x, int n);
657659
void tq_softmax(float* x, int n);

src/engine/tq_ops.c

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,8 +1698,11 @@ void tq_rmsnorm(float* out, const float* x, const float* weight, int n, float ep
16981698
* Rotary Positional Embedding (RoPE)
16991699
*
17001700
* Applies rotation to pairs (q[2i], q[2i+1]) based on position.
1701-
* Compatible with LLaMA / Qwen RoPE convention.
1702-
* ============================================================ */
1701+
* Compatible with LLaMA / Qwen2 RoPE convention.
1702+
*
1703+
* NOTE: Qwen3 family (pure Qwen3 AND hybrid Qwen3.5/3.6) uses
1704+
* LLAMA_ROPE_TYPE_NEOX / IMROPE — half-split pairs (q[i], q[i+half]).
1705+
* Use tq_rope_neox for those.
 * ============================================================ */
17031706
void tq_rope(float* q, float* k, int pos, int head_dim,
17041707
int n_heads, int n_kv_heads, float freq_base) {
17051708
/* TLS sin/cos cache keyed on (pos, freq_base, head_dim). Identical
@@ -1751,6 +1754,57 @@ void tq_rope(float* q, float* k, int pos, int head_dim,
17511754
}
17521755
}
17531756

1757+
/* ============================================================
1758+
* NEOX-style RoPE (Pillar 1.5 R3): half-split pairs (q[i], q[i+half]).
1759+
* llama.cpp maps Qwen3 / Qwen3MOE / Qwen35 / Qwen35MOE → NEOX/IMROPE.
1760+
* LLaMA 2 / Qwen2 use tq_rope (interleaved pairs).
1761+
* Missing this on batched prefill + full-rotary per-token path was
1762+
* root cause of Qwen3 long-prompt UTF-8 garbage (R7/R8 bug).
1763+
* ============================================================ */
1764+
void tq_rope_neox(float* q, float* k, int pos, int head_dim,
1765+
int n_heads, int n_kv_heads, float freq_base) {
1766+
int half = head_dim / 2;
1767+
static __thread float tls_cos[256];
1768+
static __thread float tls_sin[256];
1769+
static __thread int tls_pos = -1;
1770+
static __thread float tls_base = 0.0f;
1771+
static __thread int tls_dim = 0;
1772+
if (half <= 256 &&
1773+
(tls_pos != pos || tls_base != freq_base || tls_dim != head_dim)) {
1774+
for (int i = 0; i < half; i++) {
1775+
float freq = 1.0f / powf(freq_base, 2.0f * i / head_dim);
1776+
float theta = pos * freq;
1777+
tls_cos[i] = cosf(theta);
1778+
tls_sin[i] = sinf(theta);
1779+
}
1780+
tls_pos = pos;
1781+
tls_base = freq_base;
1782+
tls_dim = head_dim;
1783+
}
1784+
for (int h = 0; h < n_heads; h++) {
1785+
float* qh = q + h * head_dim;
1786+
for (int i = 0; i < half; i++) {
1787+
float cos_t = tls_cos[i];
1788+
float sin_t = tls_sin[i];
1789+
float q0 = qh[i];
1790+
float q1 = qh[i + half];
1791+
qh[i] = q0 * cos_t - q1 * sin_t;
1792+
qh[i + half] = q0 * sin_t + q1 * cos_t;
1793+
}
1794+
}
1795+
for (int h = 0; h < n_kv_heads; h++) {
1796+
float* kh = k + h * head_dim;
1797+
for (int i = 0; i < half; i++) {
1798+
float cos_t = tls_cos[i];
1799+
float sin_t = tls_sin[i];
1800+
float k0 = kh[i];
1801+
float k1 = kh[i + half];
1802+
kh[i] = k0 * cos_t - k1 * sin_t;
1803+
kh[i + half] = k0 * sin_t + k1 * cos_t;
1804+
}
1805+
}
1806+
}
1807+
17541808
/* ============================================================
17551809
* SiLU activation: x[i] = x[i] * sigmoid(x[i])
17561810
* Also known as swish activation.

src/engine/tq_transformer.c

Lines changed: 59 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,7 +1551,23 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
15511551
}
15521552
}
15531553
} else {
1554-
tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base);
1554+
/* Pillar 1.5 R3: Qwen3 / Qwen3MOE / Qwen3.5 / Qwen3.6 all use
1555+
* NEOX-ordering RoPE (llama.cpp: LLM_ARCH_QWEN3* → ROPE_NEOX).
1556+
* Detect via GGUF arch string or delta_n_heads (hybrid family).
1557+
* Opt-out: TQ_ROPE_PAIRS=1 reverts to LLaMA pairs for legacy. */
1558+
int use_neox = 0;
1559+
if (model->gguf_ctx) {
1560+
tq_gguf_ctx_t* gctx = (tq_gguf_ctx_t*)model->gguf_ctx;
1561+
if (strstr(gctx->arch, "qwen3") != NULL
1562+
|| strstr(gctx->arch, "qwen35") != NULL) use_neox = 1;
1563+
}
1564+
if (c->delta_n_heads > 0) use_neox = 1; /* Qwen3.5/3.6 hybrid */
1565+
if (getenv("TQ_ROPE_PAIRS")) use_neox = 0;
1566+
if (use_neox) {
1567+
tq_rope_neox(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base);
1568+
} else {
1569+
tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base);
1570+
}
15551571
}
15561572
}
15571573

@@ -3613,6 +3629,10 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
36133629
for (int i = 0; i < kv_dim; i++) VB[(size_t)n * kv_dim + i] += layer->v_bias[i];
36143630
}
36153631
/* 2b. QK-norm (Qwen3 — NULL for Llama). */
3632+
if (l == 0 && pos_start == 0 && getenv("TQ_DEBUG_PREFILL")) {
3633+
fprintf(stderr, "[batch-qknorm] L0 q_norm=%p k_norm=%p\n",
3634+
(void*)layer->q_norm, (void*)layer->k_norm);
3635+
}
36163636
if (layer->q_norm) {
36173637
for (int n = 0; n < N; n++) {
36183638
for (int h = 0; h < c->n_heads; h++) {
@@ -3633,7 +3653,19 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
36333653
/* 3. RoPE + KV cache write (per-token).
36343654
* Mirror tq_forward's RoPE selection: if model->rope_freqs is set
36353655
* (Llama 3.x learned RoPE scaling, 64 freq factors), apply per-pair
3636-
* factor; otherwise plain interleaved RoPE. */
3656+
* factor; otherwise plain RoPE (NEOX for Qwen3, LLaMA pairs for others).
3657+
*
3658+
* Pillar 1.5 R3: Qwen3 family uses NEOX-ordering per llama.cpp
3659+
* (LLM_ARCH_QWEN3 -> ROPE_NEOX). Without this on batched prefill,
3660+
* long-prompt attention was corrupted -> UTF-8 garbage output. */
3661+
int batch_use_neox = 0;
3662+
if (model->gguf_ctx) {
3663+
tq_gguf_ctx_t* gctx = (tq_gguf_ctx_t*)model->gguf_ctx;
3664+
if (strstr(gctx->arch, "qwen3") != NULL
3665+
|| strstr(gctx->arch, "qwen35") != NULL) batch_use_neox = 1;
3666+
}
3667+
if (c->delta_n_heads > 0) batch_use_neox = 1;
3668+
if (getenv("TQ_ROPE_PAIRS")) batch_use_neox = 0;
36373669
for (int n = 0; n < N; n++) {
36383670
float* qn = QB + (size_t)n * q_dim;
36393671
float* kn = KB + (size_t)n * kv_dim;
@@ -3651,9 +3683,15 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
36513683
float freq = base / model->rope_freqs[i];
36523684
float theta = pos * freq;
36533685
float ct = cosf(theta), st = sinf(theta);
3654-
float q0 = qh[2*i], q1 = qh[2*i+1];
3655-
qh[2*i] = q0 * ct - q1 * st;
3656-
qh[2*i+1] = q0 * st + q1 * ct;
3686+
if (batch_use_neox) {
3687+
float q0 = qh[i], q1 = qh[i + rope_pairs];
3688+
qh[i] = q0 * ct - q1 * st;
3689+
qh[i + rope_pairs] = q0 * st + q1 * ct;
3690+
} else {
3691+
float q0 = qh[2*i], q1 = qh[2*i+1];
3692+
qh[2*i] = q0 * ct - q1 * st;
3693+
qh[2*i+1] = q0 * st + q1 * ct;
3694+
}
36573695
}
36583696
}
36593697
for (int h = 0; h < c->n_kv_heads; h++) {
@@ -3663,14 +3701,25 @@ int tq_forward_batch(tq_model_t* model, tq_state_t* s,
36633701
float freq = base / model->rope_freqs[i];
36643702
float theta = pos * freq;
36653703
float ct = cosf(theta), st = sinf(theta);
3666-
float k0 = kh[2*i], k1 = kh[2*i+1];
3667-
kh[2*i] = k0 * ct - k1 * st;
3668-
kh[2*i+1] = k0 * st + k1 * ct;
3704+
if (batch_use_neox) {
3705+
float k0 = kh[i], k1 = kh[i + rope_pairs];
3706+
kh[i] = k0 * ct - k1 * st;
3707+
kh[i + rope_pairs] = k0 * st + k1 * ct;
3708+
} else {
3709+
float k0 = kh[2*i], k1 = kh[2*i+1];
3710+
kh[2*i] = k0 * ct - k1 * st;
3711+
kh[2*i+1] = k0 * st + k1 * ct;
3712+
}
36693713
}
36703714
}
36713715
} else {
3672-
tq_rope(qn, kn, pos, c->head_dim, c->n_heads, c->n_kv_heads,
3673-
c->rope_freq_base);
3716+
if (batch_use_neox) {
3717+
tq_rope_neox(qn, kn, pos, c->head_dim, c->n_heads, c->n_kv_heads,
3718+
c->rope_freq_base);
3719+
} else {
3720+
tq_rope(qn, kn, pos, c->head_dim, c->n_heads, c->n_kv_heads,
3721+
c->rope_freq_base);
3722+
}
36743723
}
36753724
if (n == 0 && l == 0 && dbg) {
36763725
fprintf(stderr, "[batch] L0 QB (post-RoPE) tok0 [0:8] = ");

0 commit comments

Comments
 (0)