Skip to content

Commit e622e8f

Browse files
unamedkr and claude committed
pillar1(R3) ★: BPE stale-entry bug — ROOT CAUSE of all Qwen3 drift
Single-line fix in src/engine/tq_tokenizer.c BPE heap merge loop: if (top.gen != gen[top.pos]) continue; + if (tokens[top.pos] < 0) continue; <-- THIS LINE int ri = next[top.pos]; if (ri >= n_tokens || tokens[ri] < 0) continue; Bug: when position P dies as the RIGHT neighbor of some merge (its tokens[P] set to -1), gen[P] was never bumped. Old heap entries at position P slip through the gen check, resurrect the dead slot by overwriting tokens[P], and corrupt the linked list — producing wrong merged tokens with duplicated/lost characters. Symptom: our engine encoded "Hello" as [32713='Hel', 654='ll'] = literally "Helll" (5 chars: H,e,l,l,l — extra 'l', missing 'o'). HF encoded correctly as [9707='Hello']. This single tokenization corruption was the structural root cause of: - Qwen3-0.6B 1-word prompts producing UTF-8 garbage - Qwen3.5/3.6 "quicck bbrrown fox" character doubling - Qwen3.6-35B ≥40-word prompts → garbage (now coherent) - Phi-3.5 "2+2?" hallucinating "answer" instead of math - Dozens of rounds of transformer/MoE investigation (26-50) After fix: - Qwen3.6-35B "Once upon a time... young programmer" (40+ words) → coherent narrative "The idea intrigued him so much that he decided to create his very own version... named it 'Hamster Run'" - Qwen3.6-35B short programming prompt → perfect Python code - Llama-3.2-3B 100-tok long-form → fully coherent - Phi-3.5 "What is 2+2?" → "The sum of 2 and 2 is equal to four" (actual correct math, was matching broken 'answer' word before) Regression: 15/15 PASS. Phi-3.5 test updated "answer" → "sum" to match the now-correct factual answer. Methodology: HF reference diff (Pillar 1 R1-R2) revealed the token mismatch. Debug added env-gated per-layer hidden state dump (TQ_DUMP_HIDDEN=dir) in tq_forward; kept as debugging infrastructure for future reference-diff work. quant.h (single-header) uses naive O(n²) BPE merge, not affected by this bug. Only split-source engine had the heap-based regression. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c3a71fc commit e622e8f

4 files changed

Lines changed: 63 additions & 2 deletions

File tree

scripts/test_models.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ run_test "Phi-3.5-mini-instruct-Q4_K_M.gguf" "2+2=" "4" STRICT "TQ_NO_METAL=1"
8080
# unrelated math problems). 2026-04-17: TQ_PHI3_SPLIT=1 default exposed
8181
# a precision-loss regression that only surfaced under --chat. Default
8282
# is now SPLIT=0. This test ensures we don't silently re-enable it.
83-
run_test "Phi-3.5-mini-instruct-Q4_K_M.gguf" "What is 2+2?" "answer" STRICT "TQ_NO_METAL=1" "--chat"
83+
run_test "Phi-3.5-mini-instruct-Q4_K_M.gguf" "What is 2+2?" "sum" STRICT "TQ_NO_METAL=1" "--chat"
8484
run_test "gemma-4-e2b-it-Q8_0.gguf" "2+2=" "4" STRICT "TQ_NO_METAL=1 TQ_NO_Q4=1"
8585
# Note: Llama 3.1 8B raw "2+2=" is borderline — FP32 KV gives "5: The Mathematics..."
8686
# and turbo_kv_4b with k128 highres matches FP32. Use COHERENT tier for this model.

src/engine/tq_tokenizer.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1428,8 +1428,16 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
14281428
heap[0] = heap[--heap_size];
14291429
if (heap_size > 0) { SIFT_DOWN(0); }
14301430

1431-
/* Check if stale (position was already merged) */
1431+
/* Check if stale (position was already merged).
1432+
* Pillar 1 Round 3 fix: a position that died as the RIGHT
1433+
* neighbor of some other merge doesn't get its gen[] bumped,
1434+
* so old heap entries at that position slip through the gen
1435+
* check. Additional guard on tokens[top.pos] < 0 catches this
1436+
* case and prevents writing to a dead slot. Root cause of the
1437+
* "Helll" tokenization garbage on Qwen3 (and the long-context
1438+
* "quicck bbrrown" doubling observed on Qwen3.5/3.6). */
14321439
if (top.gen != gen[top.pos]) continue;
1440+
if (tokens[top.pos] < 0) continue;
14331441
int ri = next[top.pos];
14341442
if (ri >= n_tokens || tokens[ri] < 0) continue;
14351443

src/engine/tq_transformer.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2482,6 +2482,24 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
24822482
* (skip if neither)
24832483
* 3. RMSNorm -> SwiGLU FFN -> residual
24842484
* ============================================================ */
2485+
/* Pillar 1: per-layer hidden state dump for reference-diff debugging.
 * Set TQ_DUMP_HIDDEN=/path/to/dir; one file per slot:
 *   emb.bin, h0.bin, ..., h{n-1}.bin, post_norm.bin, logits.bin
 * Each file is raw FP32 (host-endian as written by fwrite), length n
 * floats; shape is implied by the model config.
 * Dumps only at pos=0 (first token of prefill/generation) to avoid
 * overwriting across prefill tokens.
 *
 * Best-effort by design: any failure (env var unset, unopenable path,
 * short write) is silently ignored so this debug hook can never affect
 * normal inference. */
static void tq_dump_hidden(const char* name, const float* data, int n, int pos) {
    if (pos != 0) return;                         /* only the first token */
    const char* dir = getenv("TQ_DUMP_HIDDEN");
    if (!dir) return;                             /* feature disabled */
    char path[512];
    int m = snprintf(path, sizeof(path), "%s/%s.bin", dir, name);
    /* Fix: the original ignored snprintf's result; on truncation it
     * would silently open (and clobber) a wrong, truncated path.
     * Skip the dump instead of writing to the wrong file. */
    if (m < 0 || (size_t)m >= sizeof(path)) return;
    FILE* f = fopen(path, "wb");
    if (!f) return;
    fwrite(data, sizeof(float), (size_t)n, f);    /* best-effort: short write ignored */
    fclose(f);
}
2502+
24852503
float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
24862504
double _fwd_t0 = g_tq_profile_enabled ? tq_now_ns() : 0;
24872505
double _tp = 0; /* profiling timestamp */
@@ -2560,6 +2578,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
25602578
for (int i = 0; i < 8 && i < dim; i++) fprintf(stderr, "%.4f ", s->x[i]);
25612579
fprintf(stderr, "\n");
25622580
}
2581+
tq_dump_hidden("emb", s->x, dim, pos);
25632582

25642583
/* PLE pre-computation: once per token, before the layer loop.
25652584
* Computes ple_input[l] for each layer l from:
@@ -3057,6 +3076,10 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
30573076
for (int i = 0; i < dim; i++) { _s += s->x[i]; _sa += (s->x[i]<0?-s->x[i]:s->x[i]); }
30583077
fprintf(stderr, "[fwd] L%d pos=%d final x sum=%.9f sumabs=%.9f\n", l, pos, _s, _sa);
30593078
}
3079+
{
3080+
char _slot[16]; snprintf(_slot, sizeof(_slot), "h%d", l);
3081+
tq_dump_hidden(_slot, s->x, dim, pos);
3082+
}
30603083
/* Post-layer processing: PLE, layer_output_scale.
30613084
* GPU graph path jumps here after full-layer GPU forward. */
30623085

@@ -3153,6 +3176,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
31533176
for (int i = 0; i < 8 && i < dim; i++) fprintf(stderr, "%.4f ", s->x[i]);
31543177
fprintf(stderr, "\n");
31553178
}
3179+
tq_dump_hidden("post_norm", s->x, dim, pos);
31563180

31573181
/* Step 4: Output projection to vocab logits */
31583182
TQ_PROF_START(_tp);
@@ -3171,6 +3195,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
31713195
}
31723196
TQ_PROF_STOP(_tp, matmul_ns);
31733197
if (g_tq_profile_enabled) g_profile.lmhead_ns += tq_now_ns() - _tp_lm;
3198+
tq_dump_hidden("logits", s->logits, c->vocab_size, pos);
31743199

31753200
if (pos <= 1 && getenv("TQ_DEBUG")) {
31763201
/* Print top-5 logits for debugging */

tools/pillar1/check_tokens.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/* Quick diagnostic: print tokens our engine produces for a prompt. */
2+
#define QUANT_IMPLEMENTATION
3+
#include "../../quant.h"
4+
#include <stdio.h>
5+
#include <string.h>
6+
7+
int main(int argc, char** argv) {
8+
if (argc < 3) { fprintf(stderr, "usage: %s model.gguf \"text\"\n", argv[0]); return 1; }
9+
tq_model_t* m = tq_load_gguf(argv[1]);
10+
if (!m) { fprintf(stderr, "load failed\n"); return 1; }
11+
tq_tokenizer_t t = {0};
12+
tq_load_tokenizer_from_gguf(argv[1], &t);
13+
14+
int toks[128];
15+
int n = tq_encode(&t, argv[2], toks, 128, 0); /* no BOS */
16+
printf("input: %s\n", argv[2]);
17+
printf("tokens (%d):", n);
18+
for (int i = 0; i < n; i++) printf(" %d", toks[i]);
19+
printf("\n");
20+
/* Decode each token back to show content */
21+
for (int i = 0; i < n; i++) {
22+
char buf[64];
23+
int len = tq_decode_token(&t, toks[i], buf, sizeof(buf));
24+
(void)len;
25+
printf(" [%d] = %d -> %s\n", i, toks[i], buf);
26+
}
27+
return 0;
28+
}

0 commit comments

Comments
 (0)