Skip to content

Commit 8a21806

Browse files
committed
feat(eval): add edit-preflight discovery lane
1 parent 2e02165 commit 8a21806

2 files changed

Lines changed: 96 additions & 1 deletion

File tree

scripts/lib/managed-mcp-session.mjs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ const execFileAsync = promisify(execFile);
66

77
async function loadSdkClient() {
88
const [{ Client }, { StdioClientTransport }] = await Promise.all([
9-
import('@modelcontextprotocol/sdk/client/index.js'),
9+
import('@modelcontextprotocol/sdk/client'),
1010
import('@modelcontextprotocol/sdk/client/stdio.js')
1111
]);
1212

tests/benchmark-comparators.test.ts

Lines changed: 95 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -310,3 +310,98 @@ describe('raw Claude result parsing', () => {
310310
expect(parsed.bestExample).toBe('src/auth/auth.interceptor.ts');
311311
});
312312
});
313+
314+
describe('benchmark comparator aggregation', () => {
315+
it('marks empty task payloads as pending evidence instead of ok', async () => {
316+
const { aggregateResults } = await importRunner();
317+
const aggregated = aggregateResults([
318+
{
319+
taskId: 't1',
320+
job: 'search',
321+
surface: 'search_codebase',
322+
usefulnessScore: 0,
323+
matchedSignals: [],
324+
missingSignals: ['results'],
325+
payloadBytes: 19,
326+
estimatedTokens: 5,
327+
toolCallCount: 1,
328+
elapsedMs: 1
329+
}
330+
]);
331+
332+
expect(aggregated.status).toBe('pending_evidence');
333+
expect(aggregated.reason).toMatch(/usable benchmark evidence/i);
334+
expect(aggregated.averageFirstRelevantHit).toBeNull();
335+
expect(aggregated.bestExampleUsefulnessRate).toBeNull();
336+
});
337+
338+
it('computes ranked-hit and best-example metrics when task evidence exists', async () => {
339+
const { aggregateResults } = await importRunner();
340+
const aggregated = aggregateResults([
341+
{
342+
taskId: 'search-1',
343+
job: 'search',
344+
surface: 'search_codebase',
345+
usefulnessScore: 0.5,
346+
matchedSignals: ['results'],
347+
missingSignals: ['searchQuality'],
348+
payloadBytes: 200,
349+
estimatedTokens: 50,
350+
toolCallCount: 1,
351+
elapsedMs: 10,
352+
firstRelevantHit: 2
353+
},
354+
{
355+
taskId: 'find-1',
356+
job: 'find',
357+
surface: 'search_codebase',
358+
usefulnessScore: 1,
359+
matchedSignals: ['bestExample'],
360+
missingSignals: [],
361+
payloadBytes: 220,
362+
estimatedTokens: 55,
363+
toolCallCount: 1,
364+
elapsedMs: 12,
365+
bestExampleUseful: true
366+
}
367+
]);
368+
369+
expect(aggregated.status).toBe('ok');
370+
expect(aggregated.averageFirstRelevantHit).toBe(2);
371+
expect(aggregated.bestExampleUsefulnessRate).toBe(1);
372+
});
373+
});
374+
375+
describe('raw Claude result parsing', () => {
376+
it('extracts files and bestExample from structured Claude output', async () => {
377+
const { parseRawClaudeStructuredResult } = await importRunner();
378+
const parsed = parseRawClaudeStructuredResult(
379+
JSON.stringify({
380+
answer: 'Use AuthInterceptor and auth.effects patterns.',
381+
files: ['src/auth/auth.interceptor.ts', 'src/auth/auth.effects.ts'],
382+
bestExample: 'src/auth/auth.interceptor.ts'
383+
})
384+
);
385+
386+
expect(parsed.payload).toContain('AuthInterceptor');
387+
expect(parsed.topFiles).toEqual([
388+
'src/auth/auth.interceptor.ts',
389+
'src/auth/auth.effects.ts'
390+
]);
391+
expect(parsed.bestExample).toBe('src/auth/auth.interceptor.ts');
392+
});
393+
394+
it('extracts files and bestExample from fenced JSON Claude output', async () => {
395+
const { parseRawClaudeStructuredResult } = await importRunner();
396+
const parsed = parseRawClaudeStructuredResult(`\`\`\`json
397+
{"answer":"Use AuthInterceptor and auth.effects patterns.","files":["src/auth/auth.interceptor.ts","src/auth/auth.effects.ts"],"bestExample":"src/auth/auth.interceptor.ts"}
398+
\`\`\``);
399+
400+
expect(parsed.payload).toContain('AuthInterceptor');
401+
expect(parsed.topFiles).toEqual([
402+
'src/auth/auth.interceptor.ts',
403+
'src/auth/auth.effects.ts'
404+
]);
405+
expect(parsed.bestExample).toBe('src/auth/auth.interceptor.ts');
406+
});
407+
});

0 commit comments

Comments
 (0)