Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 93 additions & 109 deletions scripts/benchmark-comparators.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs';
import { execSync, exec } from 'child_process';
import { parseArgs } from 'util';
import { promisify } from 'util';
import { withManagedStdioClientSession } from './lib/managed-mcp-session.mjs';

const execAsync = promisify(exec);

Expand Down Expand Up @@ -293,125 +294,108 @@ const COMPARATOR_ADAPTERS = [
// ---------------------------------------------------------------------------

async function runComparatorViaMcp(adapter, rootPath, tasks) {
let Client, StdioClientTransport;
try {
({ Client } = await import('@modelcontextprotocol/sdk/client/index.js'));
({ StdioClientTransport } = await import('@modelcontextprotocol/sdk/client/stdio.js'));
} catch (err) {
throw new Error(`MCP SDK not available: ${err.message}`);
}

const transport = new StdioClientTransport({
command: adapter.serverCommand,
args: adapter.serverArgs,
env: { ...process.env, ...adapter.serverEnv }
});

const client = new Client({ name: 'benchmark-runner', version: '1.0.0' });

try {
await client.connect(transport);
} catch (err) {
throw new Error(`Failed to connect to ${adapter.name} MCP server: ${err.message}`);
}
return withManagedStdioClientSession(
{
serverCommand: adapter.serverCommand,
serverArgs: adapter.serverArgs,
serverEnv: adapter.serverEnv,
connectTimeoutMs: adapter.connectTimeout ?? 15_000
},
async ({ client }) => {
if (adapter.initTimeout > 0) {
await new Promise((resolve) => setTimeout(resolve, adapter.initTimeout));
}

// Wait for server to be ready
if (adapter.initTimeout > 0) {
await new Promise((resolve) => setTimeout(resolve, adapter.initTimeout));
}
let availableTools = [];
try {
const toolsResult = await client.listTools();
availableTools = toolsResult.tools ?? [];
} catch (err) {
throw new Error(`Failed to list tools from ${adapter.name}: ${err.message}`);
}

// Discover tools
let availableTools = [];
try {
const toolsResult = await client.listTools();
availableTools = toolsResult.tools ?? [];
} catch (err) {
await client.close();
throw new Error(`Failed to list tools from ${adapter.name}: ${err.message}`);
}
const toolNames = availableTools.map((t) => t.name);
let searchToolName = adapter.searchTool;
if (!searchToolName) {
searchToolName =
toolNames.find((n) => n.includes('search') || n.includes('query') || n.includes('find')) ??
null;
}

const toolNames = availableTools.map((t) => t.name);
if (!searchToolName || !toolNames.includes(searchToolName)) {
throw new Error(
`No suitable search tool found for ${adapter.name}. Available: ${toolNames.join(', ')}`
);
}

// Determine search tool (may be dynamic for CodeGraphContext)
let searchToolName = adapter.searchTool;
if (!searchToolName) {
// Try to find a search-like tool
searchToolName =
toolNames.find((n) => n.includes('search') || n.includes('query') || n.includes('find')) ??
null;
}
let totalToolCalls = 0;
if (adapter.indexTool && toolNames.includes(adapter.indexTool)) {
console.log(` [${adapter.name}] Indexing ${path.basename(rootPath)}...`);
const indexStartMs = Date.now();
try {
await Promise.race([
client.callTool({ name: adapter.indexTool, arguments: adapter.indexArgs(rootPath) }),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Index timeout')), adapter.indexTimeout ?? 120000)
)
]);
totalToolCalls++;
console.log(` [${adapter.name}] Index complete in ${Date.now() - indexStartMs}ms`);
} catch (err) {
throw new Error(`${adapter.name} indexing failed: ${err.message}`);
}
}

if (!searchToolName || !toolNames.includes(searchToolName)) {
await client.close();
throw new Error(
`No suitable search tool found for ${adapter.name}. Available: ${toolNames.join(', ')}`
);
}
const taskResults = [];
for (const task of tasks) {
const startMs = Date.now();
let payload = '';
let toolCallCount = totalToolCalls;

try {
const result = await client.callTool({
name: searchToolName,
arguments: adapter.searchArgs(task)
});
toolCallCount++;
payload = adapter.extractPayload(result);
} catch (err) {
console.warn(` [${adapter.name}] Task ${task.id} failed: ${err.message}`);
payload = '';
}

// Index the repo if needed
let totalToolCalls = 0;
if (adapter.indexTool && toolNames.includes(adapter.indexTool)) {
console.log(` [${adapter.name}] Indexing ${path.basename(rootPath)}...`);
const indexStartMs = Date.now();
try {
await Promise.race([
client.callTool({ name: adapter.indexTool, arguments: adapter.indexArgs(rootPath) }),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Index timeout')), adapter.indexTimeout ?? 120000)
)
]);
totalToolCalls++;
console.log(` [${adapter.name}] Index complete in ${Date.now() - indexStartMs}ms`);
} catch (err) {
await client.close();
throw new Error(`${adapter.name} indexing failed: ${err.message}`);
}
}
const elapsedMs = Date.now() - startMs;
const payloadBytes = countUtf8Bytes(payload);
const estimatedTokens = estimateTokens(payloadBytes);
const { usefulnessScore, matchedSignals, missingSignals } = matchSignals(
payload,
task.expectedSignals,
task.forbiddenSignals
);

// Run each task
const taskResults = [];
for (const task of tasks) {
const startMs = Date.now();
let payload = '';
let toolCallCount = totalToolCalls;
taskResults.push({
taskId: task.id,
job: task.job,
surface: task.surface,
usefulnessScore,
matchedSignals,
missingSignals,
payloadBytes,
estimatedTokens,
toolCallCount,
elapsedMs
});
}

try {
const result = await client.callTool({
name: searchToolName,
arguments: adapter.searchArgs(task)
});
toolCallCount++;
payload = adapter.extractPayload(result);
} catch (err) {
console.warn(` [${adapter.name}] Task ${task.id} failed: ${err.message}`);
payload = '';
return taskResults;
}

const elapsedMs = Date.now() - startMs;
const payloadBytes = countUtf8Bytes(payload);
const estimatedTokens = estimateTokens(payloadBytes);
const { usefulnessScore, matchedSignals, missingSignals } = matchSignals(
payload,
task.expectedSignals,
task.forbiddenSignals
).catch((err) => {
throw new Error(
err.message.includes('timed out')
? `Failed to connect to ${adapter.name} MCP server: ${err.message}`
: err.message
);

taskResults.push({
taskId: task.id,
job: task.job,
surface: task.surface,
usefulnessScore,
matchedSignals,
missingSignals,
payloadBytes,
estimatedTokens,
toolCallCount,
elapsedMs
});
}

await client.close();
return taskResults;
});
Comment on lines +392 to +398
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Dead-code branch in catch handler

err.message.startsWith('Failed to connect to') can never be true here. withManagedStdioClientSession either throws "MCP client connect timed out after Xs" (timeout path) or re-throws errors from the callback, none of which start with "Failed to connect to". The guard is harmless but misleading; removing it makes the intent clearer:

Suggested change
).catch((err) => {
if (err.message.startsWith('Failed to connect to')) {
throw err;
}
const elapsedMs = Date.now() - startMs;
const payloadBytes = countUtf8Bytes(payload);
const estimatedTokens = estimateTokens(payloadBytes);
const { usefulnessScore, matchedSignals, missingSignals } = matchSignals(
payload,
task.expectedSignals,
task.forbiddenSignals
throw new Error(
err.message.includes('timed out')
? `Failed to connect to ${adapter.name} MCP server: ${err.message}`
: err.message
);
taskResults.push({
taskId: task.id,
job: task.job,
surface: task.surface,
usefulnessScore,
matchedSignals,
missingSignals,
payloadBytes,
estimatedTokens,
toolCallCount,
elapsedMs
});
}
await client.close();
return taskResults;
});
}).catch((err) => {
throw new Error(
err.message.includes('timed out')
? `Failed to connect to ${adapter.name} MCP server: ${err.message}`
: err.message
);
});

}

// ---------------------------------------------------------------------------
Expand Down
104 changes: 104 additions & 0 deletions scripts/lib/managed-mcp-session.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import process from 'node:process';

async function loadSdkClient() {
const [{ Client }, { StdioClientTransport }] = await Promise.all([
import('@modelcontextprotocol/sdk/client/index.js'),
import('@modelcontextprotocol/sdk/client/stdio.js')
]);

return { Client, StdioClientTransport };
}

function createTimeoutError(timeoutMs) {
const seconds = Math.max(1, Math.round(timeoutMs / 1000));
return new Error(`MCP client connect timed out after ${seconds}s`);
}

function withTimeout(promise, timeoutMs) {
if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
return promise;
}

let timer;
const timeoutPromise = new Promise((_, reject) => {
timer = setTimeout(() => reject(createTimeoutError(timeoutMs)), timeoutMs);
timer.unref?.();
});

return Promise.race([promise, timeoutPromise]).finally(() => {
if (timer) {
clearTimeout(timer);
}
});
}

function delay(timeoutMs) {
return new Promise((resolve) => {
const timer = setTimeout(resolve, timeoutMs);
timer.unref?.();
});
}

async function safeClose(client, transport, connected) {
const closeAttempts = [];

if (connected) {
closeAttempts.push(client.close().catch(() => undefined));
}

closeAttempts.push(transport.close?.().catch(() => undefined));
await Promise.all(closeAttempts);
}

export async function withManagedStdioClientSession(options, callback) {
const {
serverCommand,
serverArgs = [],
serverEnv = {},
cwd,
clientInfo = { name: 'benchmark-runner', version: '1.0.0' },
connectTimeoutMs = 10_000,
onSpawn
} = options;

const { Client, StdioClientTransport } = await loadSdkClient();
const transport = new StdioClientTransport({
command: serverCommand,
args: serverArgs,
env: { ...process.env, ...serverEnv },
cwd
});
const client = new Client(clientInfo);

let connected = false;
let settling = false;
const connectPromise = client.connect(transport);
const spawnNotification = (async () => {
if (typeof onSpawn !== 'function') {
return;
}

while (!settling) {
if (transport.pid !== null) {
onSpawn(transport.pid);
return;
}
await new Promise((resolve) => setTimeout(resolve, 10));
}

if (transport.pid !== null) {
onSpawn(transport.pid);
}
})();
Comment on lines +73 to +92
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 onSpawn can be silently dropped due to settling race

The polling loop exits the moment settling = true is set in the finally block. If the child process spawns during the 10 ms sleep that happens to span the settling = true assignment, the loop re-evaluates while (!settling), sees true, and exits without ever calling onSpawn — even though transport.pid is now populated. In the connect-timeout test this leaves pid === null, causing expect(pid).toBeTypeOf('number') to fail.

In practice, with a 200 ms timeout and a local process that spawns in < 50 ms, the window is very narrow. But under CI load or on slow machines the test can become flaky. A robust fix is to perform one final pid check after the loop exits:

const spawnNotification = (async () => {
  if (typeof onSpawn !== 'function') return;

  while (!settling) {
    if (transport.pid !== null) {
      onSpawn(transport.pid);
      return;
    }
    await new Promise((resolve) => setTimeout(resolve, 10));
  }
  // One final check: process may have spawned while we were asleep
  if (transport.pid !== null) {
    onSpawn(transport.pid);
  }
})();


try {
await withTimeout(connectPromise, connectTimeoutMs);
connected = true;
return await callback({ client, transport });
} finally {
settling = true;
await safeClose(client, transport, connected);
await spawnNotification.catch(() => undefined);
await Promise.race([connectPromise, delay(5_000)]).catch(() => undefined);
}
}
Loading
Loading