@@ -310,3 +310,98 @@ describe('raw Claude result parsing', () => {
310310 expect ( parsed . bestExample ) . toBe ( 'src/auth/auth.interceptor.ts' ) ;
311311 } ) ;
312312} ) ;
313+
describe('benchmark comparator aggregation', () => {
  // A single task that scored zero and matched no signals: the aggregate is
  // expected to be flagged as pending evidence, with the hit/best-example
  // metrics left as null.
  it('marks empty task payloads as pending evidence instead of ok', async () => {
    const { aggregateResults } = await importRunner();

    const emptyTask = {
      taskId: 't1',
      job: 'search',
      surface: 'search_codebase',
      usefulnessScore: 0,
      matchedSignals: [],
      missingSignals: ['results'],
      payloadBytes: 19,
      estimatedTokens: 5,
      toolCallCount: 1,
      elapsedMs: 1,
    };
    const aggregated = aggregateResults([emptyTask]);

    expect(aggregated.status).toBe('pending_evidence');
    expect(aggregated.reason).toMatch(/usable benchmark evidence/i);
    expect(aggregated.averageFirstRelevantHit).toBeNull();
    expect(aggregated.bestExampleUsefulnessRate).toBeNull();
  });

  // One search task reporting firstRelevantHit and one find task reporting
  // bestExampleUseful: both derived metrics should be populated.
  it('computes ranked-hit and best-example metrics when task evidence exists', async () => {
    const { aggregateResults } = await importRunner();

    const searchTask = {
      taskId: 'search-1',
      job: 'search',
      surface: 'search_codebase',
      usefulnessScore: 0.5,
      matchedSignals: ['results'],
      missingSignals: ['searchQuality'],
      payloadBytes: 200,
      estimatedTokens: 50,
      toolCallCount: 1,
      elapsedMs: 10,
      firstRelevantHit: 2,
    };
    const findTask = {
      taskId: 'find-1',
      job: 'find',
      surface: 'search_codebase',
      usefulnessScore: 1,
      matchedSignals: ['bestExample'],
      missingSignals: [],
      payloadBytes: 220,
      estimatedTokens: 55,
      toolCallCount: 1,
      elapsedMs: 12,
      bestExampleUseful: true,
    };
    const aggregated = aggregateResults([searchTask, findTask]);

    expect(aggregated.status).toBe('ok');
    expect(aggregated.averageFirstRelevantHit).toBe(2);
    expect(aggregated.bestExampleUsefulnessRate).toBe(1);
  });
});
374+
// NOTE(review): the surrounding diff context suggests a suite with this same
// name may already exist earlier in the file — confirm this is not an
// accidental duplicate.
describe('raw Claude result parsing', () => {
  // Plain JSON string input: payload, topFiles and bestExample are checked.
  it('extracts files and bestExample from structured Claude output', async () => {
    const { parseRawClaudeStructuredResult } = await importRunner();

    const rawOutput = JSON.stringify({
      answer: 'Use AuthInterceptor and auth.effects patterns.',
      files: ['src/auth/auth.interceptor.ts', 'src/auth/auth.effects.ts'],
      bestExample: 'src/auth/auth.interceptor.ts',
    });
    const parsed = parseRawClaudeStructuredResult(rawOutput);

    expect(parsed.payload).toContain('AuthInterceptor');
    expect(parsed.topFiles).toEqual([
      'src/auth/auth.interceptor.ts',
      'src/auth/auth.effects.ts',
    ]);
    expect(parsed.bestExample).toBe('src/auth/auth.interceptor.ts');
  });

  // Same JSON document, but wrapped in a Markdown ```json fence.
  it('extracts files and bestExample from fenced JSON Claude output', async () => {
    const { parseRawClaudeStructuredResult } = await importRunner();

    const fencedOutput = [
      '```json',
      '{"answer":"Use AuthInterceptor and auth.effects patterns.","files":["src/auth/auth.interceptor.ts","src/auth/auth.effects.ts"],"bestExample":"src/auth/auth.interceptor.ts"}',
      '```',
    ].join('\n');
    const parsed = parseRawClaudeStructuredResult(fencedOutput);

    expect(parsed.payload).toContain('AuthInterceptor');
    expect(parsed.topFiles).toEqual([
      'src/auth/auth.interceptor.ts',
      'src/auth/auth.effects.ts',
    ]);
    expect(parsed.bestExample).toBe('src/auth/auth.interceptor.ts');
  });
});
0 commit comments