Skip to content

Commit 34f5ef4

Browse files
unamedkr and claude committed
metal: add per-token flush/op counters for Issue #16 investigation
tq_metal_diag_get() reports how many times tq_metal_batch_flush() hit the GPU sync path during a run, plus the total ops in those flushes; tq_metal_diag_reset() zeroes both counters. The PPL tool now prints flushes/token and ops/flush at the end of an eval when TQ_HAS_METAL is set (the line is omitted when no flush occurred).

This gives empirical answers (instead of guesses) to:
- How often does the dispatch path actually fire?
- How many ops are amortized per flush?

Used during the Issue #16 investigation to confirm that the Q8_0 weight path never enters Metal batch mode (0 flushes/token), narrowing the slowdown source to Q4_K (gguf_w*) and the fused tq_metal_forward_layer Q4 path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2dcbde4 commit 34f5ef4

2 files changed

Lines changed: 33 additions & 0 deletions

File tree

src/backend/metal/tq_metal_dispatch.m

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,22 @@ void tq_metal_batch_begin(void) {
667667
* Flush the current batch: end encoding, commit, wait, copy results.
668668
* Safe to call when no batch is active (no-op).
669669
*/
670+
/* Diagnostic counters (Issue #16: dispatch overhead investigation).
 *
 * g_metal_flush_count    - number of flushes that reached the commit/wait path.
 * g_metal_flush_op_count - sum of tq_batch.n_copies over those flushes.
 *
 * Both are advanced by tq_metal_batch_flush() with a plain `+=`, and only
 * inside its "command buffer present and n_copies > 0" branch, so flushes
 * that do nothing are not counted. The counters are declared _Atomic so that
 * each of those `+=` statements is an atomic read-modify-write, which is what
 * the "one atomic add per counter per flush" cost description relies on.
 *
 * Reset by callers via tq_metal_diag_reset(); read via tq_metal_diag_get().
 */
static _Atomic unsigned long g_metal_flush_count = 0;
static _Atomic unsigned long g_metal_flush_op_count = 0; /* total dispatched ops across flushes */

/**
 * Zero both diagnostic counters.
 *
 * Each counter is cleared with its own atomic store; the pair is not cleared
 * as a single transaction.
 */
void tq_metal_diag_reset(void) {
    g_metal_flush_count = 0;
    g_metal_flush_op_count = 0;
}

/**
 * Read the diagnostic counters.
 *
 * @param flushes  Receives the flush count; may be NULL to skip it.
 * @param ops      Receives the accumulated op count; may be NULL to skip it.
 *
 * The two values are loaded independently, so they are only guaranteed to be
 * mutually consistent when no flush is running concurrently.
 */
void tq_metal_diag_get(unsigned long* flushes, unsigned long* ops) {
    if (flushes) {
        *flushes = g_metal_flush_count;
    }
    if (ops) {
        *ops = g_metal_flush_op_count;
    }
}
685+
670686
void tq_metal_batch_flush(void) {
671687
if (!tq_batch.active) return;
672688

@@ -677,6 +693,8 @@ void tq_metal_batch_flush(void) {
677693
}
678694

679695
if (tq_batch.cmd_buf && tq_batch.n_copies > 0) {
696+
g_metal_flush_count += 1;
697+
g_metal_flush_op_count += (unsigned long)tq_batch.n_copies;
680698
[tq_batch.cmd_buf commit];
681699
[tq_batch.cmd_buf waitUntilCompleted];
682700

tools/quant.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,21 @@ int main(int argc, char** argv) {
510510
/* Machine-parseable */
511511
fprintf(stderr, "PPL_CSV:%d,%.6f,%.4f\n", n_eval, avg_nll, perplexity);
512512

513+
#ifdef TQ_HAS_METAL
514+
{
515+
extern void tq_metal_diag_get(unsigned long*, unsigned long*);
516+
unsigned long n_flushes = 0, n_ops = 0;
517+
tq_metal_diag_get(&n_flushes, &n_ops);
518+
if (n_flushes > 0 && n_eval > 0) {
519+
fprintf(stderr, "Metal diag: %lu flushes, %lu ops total, "
520+
"%.1f flushes/token, %.1f ops/flush\n",
521+
n_flushes, n_ops,
522+
(double)n_flushes / (double)n_eval,
523+
(double)n_ops / (double)n_flushes);
524+
}
525+
}
526+
#endif
527+
513528
/* JSON output (--json flag) */
514529
if (json_output) {
515530
const char* kv_name = kv_type < TQ_TYPE_COUNT ? tq_type_name(kv_type) : "fp32";

0 commit comments

Comments
 (0)