Skip to content

Commit e622e8f

Browse files
unamedkr and claude committed
pillar1(R3) ★: BPE stale-entry bug — ROOT CAUSE of all Qwen3 drift
Single-line fix in src/engine/tq_tokenizer.c BPE heap merge loop: if (top.gen != gen[top.pos]) continue; + if (tokens[top.pos] < 0) continue; <-- THIS LINE int ri = next[top.pos]; if (ri >= n_tokens || tokens[ri] < 0) continue; Bug: when position P dies as the RIGHT neighbor of some merge (its tokens[P] set to -1), gen[P] was never bumped. Old heap entries at position P slip through the gen check, resurrect the dead slot by overwriting tokens[P], and corrupt the linked list — producing wrong merged tokens with duplicated/lost characters. Symptom: our engine encoded "Hello" as [32713='Hel', 654='ll'] = literally "Helll" (5 chars: H,e,l,l,l — extra 'l', missing 'o'). HF encoded correctly as [9707='Hello']. This single tokenization corruption was the structural root cause of: - Qwen3-0.6B 1-word prompts producing UTF-8 garbage - Qwen3.5/3.6 "quicck bbrrown fox" character doubling - Qwen3.6-35B ≥40-word prompts → garbage (now coherent) - Phi-3.5 "2+2?" hallucinating "answer" instead of math - Dozens of rounds of transformer/MoE investigation (26-50) After fix: - Qwen3.6-35B "Once upon a time... young programmer" (40+ words) → coherent narrative "The idea intrigued him so much that he decided to create his very own version... named it 'Hamster Run'" - Qwen3.6-35B short programming prompt → perfect Python code - Llama-3.2-3B 100-tok long-form → fully coherent - Phi-3.5 "What is 2+2?" → "The sum of 2 and 2 is equal to four" (actual correct math, was matching broken 'answer' word before) Regression: 15/15 PASS. Phi-3.5 test updated "answer" → "sum" to match the now-correct factual answer. Methodology: HF reference diff (Pillar 1 R1-R2) revealed the token mismatch. Debug added env-gated per-layer hidden state dump (TQ_DUMP_HIDDEN=dir) in tq_forward; kept as debugging infrastructure for future reference-diff work. quant.h (single-header) uses naive O(n²) BPE merge, not affected by this bug. Only split-source engine had the heap-based regression. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c3a71fc commit e622e8f

4 files changed

Lines changed: 63 additions & 2 deletions

File tree

scripts/test_models.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ run_test "Phi-3.5-mini-instruct-Q4_K_M.gguf" "2+2=" "4" STRICT "TQ_NO_METAL=1"
8080
# unrelated math problems). 2026-04-17: TQ_PHI3_SPLIT=1 default exposed
8181
# a precision-loss regression that only surfaced under --chat. Default
8282
# is now SPLIT=0. This test ensures we don't silently re-enable it.
83-
run_test "Phi-3.5-mini-instruct-Q4_K_M.gguf" "What is 2+2?" "answer" STRICT "TQ_NO_METAL=1" "--chat"
83+
run_test "Phi-3.5-mini-instruct-Q4_K_M.gguf" "What is 2+2?" "sum" STRICT "TQ_NO_METAL=1" "--chat"
8484
run_test "gemma-4-e2b-it-Q8_0.gguf" "2+2=" "4" STRICT "TQ_NO_METAL=1 TQ_NO_Q4=1"
8585
# Note: Llama 3.1 8B raw "2+2=" is borderline — FP32 KV gives "5: The Mathematics..."
8686
# and turbo_kv_4b with k128 highres matches FP32. Use COHERENT tier for this model.

src/engine/tq_tokenizer.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1428,8 +1428,16 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
14281428
heap[0] = heap[--heap_size];
14291429
if (heap_size > 0) { SIFT_DOWN(0); }
14301430

1431-
/* Check if stale (position was already merged) */
1431+
/* Check if stale (position was already merged).
1432+
* Pillar 1 Round 3 fix: a position that died as the RIGHT
1433+
* neighbor of some other merge doesn't get its gen[] bumped,
1434+
* so old heap entries at that position slip through the gen
1435+
* check. Additional guard on tokens[top.pos] < 0 catches this
1436+
* case and prevents writing to a dead slot. Root cause of the
1437+
* "Helll" tokenization garbage on Qwen3 (and the long-context
1438+
* "quicck bbrrown" doubling observed on Qwen3.5/3.6). */
14321439
if (top.gen != gen[top.pos]) continue;
1440+
if (tokens[top.pos] < 0) continue;
14331441
int ri = next[top.pos];
14341442
if (ri >= n_tokens || tokens[ri] < 0) continue;
14351443

src/engine/tq_transformer.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2482,6 +2482,24 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
24822482
* (skip if neither)
24832483
* 3. RMSNorm -> SwiGLU FFN -> residual
24842484
* ============================================================ */
2485+
/* Pillar 1: per-layer hidden state dump for reference-diff debugging.
 * Set TQ_DUMP_HIDDEN=/path/to/dir; one file per slot:
 *   emb.bin, h0.bin, ..., h{n-1}.bin, post_norm.bin, logits.bin
 * Each file is raw FP32 (host-endian as written by fwrite), length n
 * floats; shape is implied by the model config.
 * Dumps only at pos=0 (first token of prefill/generation) to avoid
 * overwriting across prefill tokens.
 *
 * Best-effort by design: any failure (env var unset, unopenable path,
 * short write) is silently ignored so this debug hook can never affect
 * normal inference. */
static void tq_dump_hidden(const char* name, const float* data, int n, int pos) {
    if (pos != 0) return;                         /* only the first token */
    const char* dir = getenv("TQ_DUMP_HIDDEN");
    if (!dir) return;                             /* feature disabled */
    char path[512];
    int m = snprintf(path, sizeof(path), "%s/%s.bin", dir, name);
    /* Fix: the original ignored snprintf's result; on truncation it
     * would silently open (and clobber) a wrong, truncated path.
     * Skip the dump instead of writing to the wrong file. */
    if (m < 0 || (size_t)m >= sizeof(path)) return;
    FILE* f = fopen(path, "wb");
    if (!f) return;
    fwrite(data, sizeof(float), (size_t)n, f);    /* best-effort: short write ignored */
    fclose(f);
}
2502+
24852503
float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
24862504
double _fwd_t0 = g_tq_profile_enabled ? tq_now_ns() : 0;
24872505
double _tp = 0; /* profiling timestamp */
@@ -2560,6 +2578,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
25602578
for (int i = 0; i < 8 && i < dim; i++) fprintf(stderr, "%.4f ", s->x[i]);
25612579
fprintf(stderr, "\n");
25622580
}
2581+
tq_dump_hidden("emb", s->x, dim, pos);
25632582

25642583
/* PLE pre-computation: once per token, before the layer loop.
25652584
* Computes ple_input[l] for each layer l from:
@@ -3057,6 +3076,10 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
30573076
for (int i = 0; i < dim; i++) { _s += s->x[i]; _sa += (s->x[i]<0?-s->x[i]:s->x[i]); }
30583077
fprintf(stderr, "[fwd] L%d pos=%d final x sum=%.9f sumabs=%.9f\n", l, pos, _s, _sa);
30593078
}
3079+
{
3080+
char _slot[16]; snprintf(_slot, sizeof(_slot), "h%d", l);
3081+
tq_dump_hidden(_slot, s->x, dim, pos);
3082+
}
30603083
/* Post-layer processing: PLE, layer_output_scale.
30613084
* GPU graph path jumps here after full-layer GPU forward. */
30623085

@@ -3153,6 +3176,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
31533176
for (int i = 0; i < 8 && i < dim; i++) fprintf(stderr, "%.4f ", s->x[i]);
31543177
fprintf(stderr, "\n");
31553178
}
3179+
tq_dump_hidden("post_norm", s->x, dim, pos);
31563180

31573181
/* Step 4: Output projection to vocab logits */
31583182
TQ_PROF_START(_tp);
@@ -3171,6 +3195,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
31713195
}
31723196
TQ_PROF_STOP(_tp, matmul_ns);
31733197
if (g_tq_profile_enabled) g_profile.lmhead_ns += tq_now_ns() - _tp_lm;
3198+
tq_dump_hidden("logits", s->logits, c->vocab_size, pos);
31743199

31753200
if (pos <= 1 && getenv("TQ_DEBUG")) {
31763201
/* Print top-5 logits for debugging */

tools/pillar1/check_tokens.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/* Quick diagnostic: print tokens our engine produces for a prompt. */
2+
#define QUANT_IMPLEMENTATION
3+
#include "../../quant.h"
4+
#include <stdio.h>
5+
#include <string.h>
6+
7+
int main(int argc, char** argv) {
8+
if (argc < 3) { fprintf(stderr, "usage: %s model.gguf \"text\"\n", argv[0]); return 1; }
9+
tq_model_t* m = tq_load_gguf(argv[1]);
10+
if (!m) { fprintf(stderr, "load failed\n"); return 1; }
11+
tq_tokenizer_t t = {0};
12+
tq_load_tokenizer_from_gguf(argv[1], &t);
13+
14+
int toks[128];
15+
int n = tq_encode(&t, argv[2], toks, 128, 0); /* no BOS */
16+
printf("input: %s\n", argv[2]);
17+
printf("tokens (%d):", n);
18+
for (int i = 0; i < n; i++) printf(" %d", toks[i]);
19+
printf("\n");
20+
/* Decode each token back to show content */
21+
for (int i = 0; i < n; i++) {
22+
char buf[64];
23+
int len = tq_decode_token(&t, toks[i], buf, sizeof(buf));
24+
(void)len;
25+
printf(" [%d] = %d -> %s\n", i, toks[i], buf);
26+
}
27+
return 0;
28+
}

0 commit comments

Comments
 (0)