Skip to content

Commit a9dc26a

Browse files
committed
feat(eval): add edit-preflight discovery lane
1 parent 0458be8 commit a9dc26a

12 files changed

Lines changed: 1459 additions & 80 deletions

scripts/benchmark-comparators.mjs

Lines changed: 409 additions & 56 deletions
Large diffs are not rendered by default.

scripts/lib/managed-mcp-session.mjs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ import process from 'node:process';
22

33
async function loadSdkClient() {
44
const [{ Client }, { StdioClientTransport }] = await Promise.all([
5-
import('@modelcontextprotocol/sdk/client/index.js'),
5+
import('@modelcontextprotocol/sdk/client'),
66
import('@modelcontextprotocol/sdk/client/stdio.js')
77
]);
88

scripts/run-eval.mjs

Lines changed: 66 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -11,33 +11,25 @@ import { analyzerRegistry } from '../dist/core/analyzer-registry.js';
1111
import { AngularAnalyzer } from '../dist/analyzers/angular/index.js';
1212
import { GenericAnalyzer } from '../dist/analyzers/generic/index.js';
1313
import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js';
14+
import {
15+
combineEditPreflightSummaries,
16+
evaluateEditPreflightFixture,
17+
formatEditPreflightReport
18+
} from '../dist/eval/edit-preflight-harness.js';
1419
import {
1520
combineDiscoverySummaries,
1621
evaluateDiscoveryGate,
1722
evaluateDiscoveryFixture,
1823
formatDiscoveryReport
1924
} from '../dist/eval/discovery-harness.js';
25+
import { getDefaultFixturePaths, resolveEvalMode } from '../dist/eval/run-config.js';
2026

2127
const __dirname = path.dirname(fileURLToPath(import.meta.url));
2228
const projectRoot = path.join(__dirname, '..');
2329
const packageJsonPath = path.join(projectRoot, 'package.json');
2430

2531
const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
2632

27-
const defaultFixtureA = path.join(projectRoot, 'tests', 'fixtures', 'eval-angular-spotify.json');
28-
const defaultFixtureB = path.join(projectRoot, 'tests', 'fixtures', 'eval-controlled.json');
29-
const defaultDiscoveryFixtureA = path.join(
30-
projectRoot,
31-
'tests',
32-
'fixtures',
33-
'discovery-angular-spotify.json'
34-
);
35-
const defaultDiscoveryFixtureB = path.join(
36-
projectRoot,
37-
'tests',
38-
'fixtures',
39-
'discovery-excalidraw.json'
40-
);
4133
const defaultDiscoveryProtocol = path.join(
4234
projectRoot,
4335
'tests',
@@ -49,7 +41,7 @@ const usage = [
4941
`Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]`,
5042
``,
5143
`Options:`,
52-
` --mode=<retrieval|discovery> Select benchmark mode (default: retrieval)`,
44+
` --mode=<retrieval|discovery|edit-preflight> Select benchmark mode (default: retrieval)`,
5345
` --fixture-a=<path> Override fixture for codebaseA`,
5446
` --fixture-b=<path> Override fixture for codebaseB`,
5547
` --protocol=<path> Override discovery benchmark protocol`,
@@ -151,6 +143,17 @@ async function runSingleEvaluation({
151143
fixturePath: resolvedFixture,
152144
summary
153145
});
146+
} else if (mode === 'edit-preflight') {
147+
console.log(`\n--- Phase 2: Running ${fixture.tasks.length}-task edit-preflight harness ---`);
148+
summary = await evaluateEditPreflightFixture({
149+
fixture,
150+
rootPath: resolvedCodebase
151+
});
152+
report = formatEditPreflightReport({
153+
codebaseLabel: label,
154+
fixturePath: resolvedFixture,
155+
summary
156+
});
154157
} else {
155158
console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`);
156159
const searcher = new CodebaseSearcher(resolvedCodebase);
@@ -202,6 +205,31 @@ function printCombinedSummary(summaries, mode) {
202205
return;
203206
}
204207

208+
if (mode === 'edit-preflight') {
209+
const combined = combineEditPreflightSummaries(summaries);
210+
console.log(`\n=== Combined Edit Preflight Summary ===`);
211+
console.log(
212+
`Top-target in top-3: ${combined.topTargetInTop3Count}/${combined.targetableTasks} (${combined.topTargetInTop3Rate === null ? 'n/a' : (combined.topTargetInTop3Rate * 100).toFixed(0) + '%'})`
213+
);
214+
console.log(
215+
`Average first relevant hit: ${combined.averageFirstRelevantHit === null ? 'n/a' : combined.averageFirstRelevantHit.toFixed(2)}`
216+
);
217+
console.log(
218+
`Best-example hit rate: ${combined.bestExampleHitCount}/${combined.bestExampleTasks} (${combined.bestExampleHitRate === null ? 'n/a' : (combined.bestExampleHitRate * 100).toFixed(0) + '%'})`
219+
);
220+
console.log(
221+
`Safe ready rate: ${combined.safeTaskReadyCount}/${combined.safeTasks} (${combined.safeTaskReadyRate === null ? 'n/a' : (combined.safeTaskReadyRate * 100).toFixed(0) + '%'})`
222+
);
223+
console.log(
224+
`Unsafe abstain rate: ${combined.unsafeTaskAbstainCount}/${combined.unsafeTasks} (${combined.unsafeTaskAbstainRate === null ? 'n/a' : (combined.unsafeTaskAbstainRate * 100).toFixed(0) + '%'})`
225+
);
226+
console.log(
227+
`Unsafe ready=true false positives: ${combined.unsafeReadyFalsePositiveCount}/${combined.unsafeTasks} (${combined.unsafeReadyFalsePositiveRate === null ? 'n/a' : (combined.unsafeReadyFalsePositiveRate * 100).toFixed(0) + '%'})`
228+
);
229+
console.log(`=======================================\n`);
230+
return;
231+
}
232+
205233
const total = summaries.reduce((sum, summary) => sum + summary.total, 0);
206234
const top1Correct = summaries.reduce((sum, summary) => sum + summary.top1Correct, 0);
207235
const top3RecallCount = summaries.reduce((sum, summary) => sum + summary.top3RecallCount, 0);
@@ -254,17 +282,14 @@ async function main() {
254282

255283
const codebaseA = positionals[0];
256284
const codebaseB = positionals[1];
257-
const mode = values.mode === 'discovery' ? 'discovery' : 'retrieval';
285+
const mode = resolveEvalMode(values.mode);
286+
const defaultFixtures = getDefaultFixturePaths(projectRoot, mode);
258287
const fixtureA = values['fixture-a']
259288
? path.resolve(values['fixture-a'])
260-
: mode === 'discovery'
261-
? defaultDiscoveryFixtureA
262-
: defaultFixtureA;
289+
: defaultFixtures.fixtureA;
263290
const fixtureB = values['fixture-b']
264291
? path.resolve(values['fixture-b'])
265-
: mode === 'discovery'
266-
? defaultDiscoveryFixtureB
267-
: defaultFixtureB;
292+
: defaultFixtures.fixtureB;
268293
const protocolPath = values.protocol
269294
? path.resolve(values.protocol)
270295
: defaultDiscoveryProtocol;
@@ -326,6 +351,25 @@ async function main() {
326351
process.exit(gate.status === 'failed' ? 1 : 0);
327352
}
328353

354+
if (mode === 'edit-preflight') {
355+
const combinedSummary = combineEditPreflightSummaries(summaries);
356+
printCombinedSummary(summaries, mode);
357+
console.log(
358+
formatEditPreflightReport({
359+
codebaseLabel: 'combined-suite',
360+
fixturePath: codebaseB ? `${fixtureA}, ${fixtureB}` : fixtureA,
361+
summary: combinedSummary
362+
})
363+
);
364+
if (outputPath) {
365+
const outputDir = path.dirname(outputPath);
366+
if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
367+
writeFileSync(outputPath, JSON.stringify(combinedSummary, null, 2));
368+
console.log(`\nResults written to: ${outputPath}`);
369+
}
370+
process.exit(0);
371+
}
372+
329373
if (outputPath && mode === 'discovery' && summaries.length === 1) {
330374
const outputDir = path.dirname(outputPath);
331375
if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });

0 commit comments

Comments
 (0)