Skip to content

Commit 2df5399

Browse files
committed
fix: finalize v2.1.0 token budget advisory
1 parent 440ef7a commit 2df5399

6 files changed

Lines changed: 175 additions & 110 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
### Documentation
1919

2020
- publish the v2.1.0 discovery benchmark rerun with the current gate output: `pending_evidence`, `claimAllowed: false`, `24` frozen tasks, `0.75` average usefulness, and `1822.25` average estimated tokens
21-
- document the current comparator truth instead of stale assumptions: the public proof still has no real comparator lane data on this host, so benchmark win claims remain blocked
22-
- note the new `searchQuality.tokenEstimate` advisory contract: estimates are based on the pre-advisory response payload and warnings only appear above the 4K-token threshold
21+
- document the current comparator truth instead of stale assumptions: the public proof still has setup failures plus near-empty comparator outputs on this host, so benchmark win claims remain blocked
22+
- note the new `searchQuality.tokenEstimate` advisory contract: estimates are based on the final serialized response payload and warnings only appear above the 4K-token threshold
2323

2424
### Features
2525

results/comparator-evidence.json

Lines changed: 63 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"averageFirstRelevantHit": null,
77
"bestExampleUsefulnessRate": null,
88
"averageToolCallCount": 1,
9-
"averageElapsedMs": 0.375,
9+
"averageElapsedMs": 0.3333333333333333,
1010
"status": "ok",
1111
"taskResults": [
1212
{
@@ -39,7 +39,7 @@
3939
"payloadBytes": 19,
4040
"estimatedTokens": 5,
4141
"toolCallCount": 1,
42-
"elapsedMs": 0
42+
"elapsedMs": 1
4343
},
4444
{
4545
"taskId": "as-map-03",
@@ -55,7 +55,7 @@
5555
"payloadBytes": 19,
5656
"estimatedTokens": 5,
5757
"toolCallCount": 1,
58-
"elapsedMs": 1
58+
"elapsedMs": 0
5959
},
6060
{
6161
"taskId": "as-map-04",
@@ -129,7 +129,7 @@
129129
"payloadBytes": 19,
130130
"estimatedTokens": 5,
131131
"toolCallCount": 1,
132-
"elapsedMs": 0
132+
"elapsedMs": 1
133133
},
134134
{
135135
"taskId": "as-search-01",
@@ -144,7 +144,7 @@
144144
"payloadBytes": 19,
145145
"estimatedTokens": 5,
146146
"toolCallCount": 1,
147-
"elapsedMs": 1
147+
"elapsedMs": 0
148148
},
149149
{
150150
"taskId": "as-search-02",
@@ -189,7 +189,7 @@
189189
"payloadBytes": 19,
190190
"estimatedTokens": 5,
191191
"toolCallCount": 1,
192-
"elapsedMs": 1
192+
"elapsedMs": 0
193193
},
194194
{
195195
"taskId": "ex-map-01",
@@ -282,7 +282,7 @@
282282
"payloadBytes": 19,
283283
"estimatedTokens": 5,
284284
"toolCallCount": 1,
285-
"elapsedMs": 1
285+
"elapsedMs": 0
286286
},
287287
{
288288
"taskId": "ex-find-03",
@@ -297,7 +297,7 @@
297297
"payloadBytes": 19,
298298
"estimatedTokens": 5,
299299
"toolCallCount": 1,
300-
"elapsedMs": 0
300+
"elapsedMs": 1
301301
},
302302
{
303303
"taskId": "ex-find-04",
@@ -389,12 +389,12 @@
389389
},
390390
"raw Claude Code": {
391391
"averageUsefulness": 0,
392-
"averagePayloadBytes": 66.08333333333333,
393-
"averageEstimatedTokens": 17.166666666666668,
392+
"averagePayloadBytes": 71.54166666666667,
393+
"averageEstimatedTokens": 18.5,
394394
"averageFirstRelevantHit": null,
395395
"bestExampleUsefulnessRate": null,
396396
"averageToolCallCount": null,
397-
"averageElapsedMs": 8944.833333333334,
397+
"averageElapsedMs": 9590.208333333334,
398398
"status": "ok",
399399
"taskResults": [
400400
{
@@ -411,7 +411,7 @@
411411
"payloadBytes": 65,
412412
"estimatedTokens": 17,
413413
"toolCallCount": null,
414-
"elapsedMs": 9148
414+
"elapsedMs": 12461
415415
},
416416
{
417417
"taskId": "as-map-02",
@@ -427,7 +427,7 @@
427427
"payloadBytes": 65,
428428
"estimatedTokens": 17,
429429
"toolCallCount": null,
430-
"elapsedMs": 9291
430+
"elapsedMs": 9390
431431
},
432432
{
433433
"taskId": "as-map-03",
@@ -443,7 +443,7 @@
443443
"payloadBytes": 65,
444444
"estimatedTokens": 17,
445445
"toolCallCount": null,
446-
"elapsedMs": 9344
446+
"elapsedMs": 9836
447447
},
448448
{
449449
"taskId": "as-map-04",
@@ -458,7 +458,7 @@
458458
"payloadBytes": 65,
459459
"estimatedTokens": 17,
460460
"toolCallCount": null,
461-
"elapsedMs": 8200
461+
"elapsedMs": 10098
462462
},
463463
{
464464
"taskId": "as-find-01",
@@ -469,10 +469,10 @@
469469
"missingSignals": [
470470
"dependencyInjection"
471471
],
472-
"payloadBytes": 65,
473-
"estimatedTokens": 17,
472+
"payloadBytes": 70,
473+
"estimatedTokens": 18,
474474
"toolCallCount": null,
475-
"elapsedMs": 8438
475+
"elapsedMs": 8937
476476
},
477477
{
478478
"taskId": "as-find-02",
@@ -483,10 +483,10 @@
483483
"missingSignals": [
484484
"stateManagement"
485485
],
486-
"payloadBytes": 65,
487-
"estimatedTokens": 17,
486+
"payloadBytes": 75,
487+
"estimatedTokens": 19,
488488
"toolCallCount": null,
489-
"elapsedMs": 8169
489+
"elapsedMs": 8747
490490
},
491491
{
492492
"taskId": "as-find-03",
@@ -499,10 +499,10 @@
499499
"bestExample",
500500
"patterns"
501501
],
502-
"payloadBytes": 70,
503-
"estimatedTokens": 18,
502+
"payloadBytes": 65,
503+
"estimatedTokens": 17,
504504
"toolCallCount": null,
505-
"elapsedMs": 7484
505+
"elapsedMs": 8747
506506
},
507507
{
508508
"taskId": "as-find-04",
@@ -517,7 +517,7 @@
517517
"payloadBytes": 65,
518518
"estimatedTokens": 17,
519519
"toolCallCount": null,
520-
"elapsedMs": 8266
520+
"elapsedMs": 9351
521521
},
522522
{
523523
"taskId": "as-search-01",
@@ -529,10 +529,10 @@
529529
"results",
530530
"searchQuality"
531531
],
532-
"payloadBytes": 65,
533-
"estimatedTokens": 17,
532+
"payloadBytes": 73,
533+
"estimatedTokens": 19,
534534
"toolCallCount": null,
535-
"elapsedMs": 8696
535+
"elapsedMs": 9376
536536
},
537537
{
538538
"taskId": "as-search-02",
@@ -544,10 +544,10 @@
544544
"results",
545545
"searchQuality"
546546
],
547-
"payloadBytes": 65,
548-
"estimatedTokens": 17,
547+
"payloadBytes": 70,
548+
"estimatedTokens": 18,
549549
"toolCallCount": null,
550-
"elapsedMs": 8139
550+
"elapsedMs": 9891
551551
},
552552
{
553553
"taskId": "as-search-03",
@@ -559,10 +559,10 @@
559559
"results",
560560
"searchQuality"
561561
],
562-
"payloadBytes": 95,
563-
"estimatedTokens": 24,
562+
"payloadBytes": 65,
563+
"estimatedTokens": 17,
564564
"toolCallCount": null,
565-
"elapsedMs": 15486
565+
"elapsedMs": 11377
566566
},
567567
{
568568
"taskId": "as-search-04",
@@ -577,7 +577,7 @@
577577
"payloadBytes": 65,
578578
"estimatedTokens": 17,
579579
"toolCallCount": null,
580-
"elapsedMs": 9048
580+
"elapsedMs": 8972
581581
},
582582
{
583583
"taskId": "ex-map-01",
@@ -590,10 +590,10 @@
590590
"architecture",
591591
"statistics"
592592
],
593-
"payloadBytes": 75,
594-
"estimatedTokens": 19,
593+
"payloadBytes": 65,
594+
"estimatedTokens": 17,
595595
"toolCallCount": null,
596-
"elapsedMs": 8162
596+
"elapsedMs": 10195
597597
},
598598
{
599599
"taskId": "ex-map-02",
@@ -609,7 +609,7 @@
609609
"payloadBytes": 65,
610610
"estimatedTokens": 17,
611611
"toolCallCount": null,
612-
"elapsedMs": 9241
612+
"elapsedMs": 8753
613613
},
614614
{
615615
"taskId": "ex-map-03",
@@ -621,10 +621,10 @@
621621
"import aliases",
622622
"tsconfig"
623623
],
624-
"payloadBytes": 19,
625-
"estimatedTokens": 5,
624+
"payloadBytes": 71,
625+
"estimatedTokens": 18,
626626
"toolCallCount": null,
627-
"elapsedMs": 8360
627+
"elapsedMs": 8860
628628
},
629629
{
630630
"taskId": "ex-map-04",
@@ -637,10 +637,10 @@
637637
"libraries actually used",
638638
"generated:"
639639
],
640-
"payloadBytes": 65,
641-
"estimatedTokens": 17,
640+
"payloadBytes": 75,
641+
"estimatedTokens": 19,
642642
"toolCallCount": null,
643-
"elapsedMs": 7935
643+
"elapsedMs": 8623
644644
},
645645
{
646646
"taskId": "ex-find-01",
@@ -651,10 +651,10 @@
651651
"missingSignals": [
652652
"stateManagement"
653653
],
654-
"payloadBytes": 65,
655-
"estimatedTokens": 17,
654+
"payloadBytes": 150,
655+
"estimatedTokens": 38,
656656
"toolCallCount": null,
657-
"elapsedMs": 9621
657+
"elapsedMs": 12098
658658
},
659659
{
660660
"taskId": "ex-find-02",
@@ -667,10 +667,10 @@
667667
"bestExample",
668668
"patterns"
669669
],
670-
"payloadBytes": 75,
671-
"estimatedTokens": 19,
670+
"payloadBytes": 65,
671+
"estimatedTokens": 17,
672672
"toolCallCount": null,
673-
"elapsedMs": 8801
673+
"elapsedMs": 8783
674674
},
675675
{
676676
"taskId": "ex-find-03",
@@ -685,7 +685,7 @@
685685
"payloadBytes": 65,
686686
"estimatedTokens": 17,
687687
"toolCallCount": null,
688-
"elapsedMs": 7509
688+
"elapsedMs": 8785
689689
},
690690
{
691691
"taskId": "ex-find-04",
@@ -696,10 +696,10 @@
696696
"missingSignals": [
697697
"dependencyInjection"
698698
],
699-
"payloadBytes": 65,
700-
"estimatedTokens": 17,
699+
"payloadBytes": 83,
700+
"estimatedTokens": 21,
701701
"toolCallCount": null,
702-
"elapsedMs": 7824
702+
"elapsedMs": 8912
703703
},
704704
{
705705
"taskId": "ex-search-01",
@@ -714,7 +714,7 @@
714714
"payloadBytes": 65,
715715
"estimatedTokens": 17,
716716
"toolCallCount": null,
717-
"elapsedMs": 8208
717+
"elapsedMs": 8043
718718
},
719719
{
720720
"taskId": "ex-search-02",
@@ -726,10 +726,10 @@
726726
"results",
727727
"searchQuality"
728728
],
729-
"payloadBytes": 77,
730-
"estimatedTokens": 20,
729+
"payloadBytes": 75,
730+
"estimatedTokens": 19,
731731
"toolCallCount": null,
732-
"elapsedMs": 9034
732+
"elapsedMs": 8755
733733
},
734734
{
735735
"taskId": "ex-search-03",
@@ -744,7 +744,7 @@
744744
"payloadBytes": 65,
745745
"estimatedTokens": 17,
746746
"toolCallCount": null,
747-
"elapsedMs": 10112
747+
"elapsedMs": 12373
748748
},
749749
{
750750
"taskId": "ex-search-04",
@@ -756,10 +756,10 @@
756756
"results",
757757
"searchQuality"
758758
],
759-
"payloadBytes": 70,
760-
"estimatedTokens": 18,
759+
"payloadBytes": 65,
760+
"estimatedTokens": 17,
761761
"toolCallCount": null,
762-
"elapsedMs": 10160
762+
"elapsedMs": 8802
763763
}
764764
]
765765
}

0 commit comments

Comments
 (0)