Skip to content

Commit 6c19628

Browse files
authored
fix(benchmarks): make all comparator lanes cross-platform on Windows (#97)
* fix(benchmarks): make all comparator lanes cross-platform on Windows

  All five comparator adapters in scripts/benchmark-comparators.mjs reported setup_failed on Windows 11 due to Unix-only shell constructs. This fixes the root causes per-lane so EVAL-02 (real benchmark data) is achievable.

  Changes by lane:
  - raw Claude Code: drop `2>/dev/null` from checkInstalled, switch runRawClaudeCode() from execAsync (shell, brittle quoting) to execFileAsync (no shell), add `--output-format json`, raise timeout 60s→120s, change pending_evidence fallback to hard setup_failed
  - codebase-memory-mcp: replace `which ... 2>/dev/null || npx` with npx-only check (cross-platform), raise initTimeout 5s→10s
  - jCodeMunch: replace hardcoded python3 with pythonCmd (python on Windows), use `python -m pip install`, raise initTimeout 8s→15s
  - CodeGraphContext: same pythonCmd consistency fix as jCodeMunch
  - GrepAI: replace `which grepai 2>/dev/null` with `grepai --version`

  Adds execFile import + execFileAsync, adds pythonCmd platform constant.

* fix(benchmarks): resolve execFileAsync .cmd issue and drop dead exec import

  On Windows, execFile does not use a shell and cannot resolve npm's .cmd wrappers (e.g. claude.cmd). checkInstalled() succeeded via execSync (which goes through cmd.exe) but runRawClaudeCode threw ENOENT, returning setup_failed on the very platform the previous commit targeted.

  Add `shell: process.platform === 'win32'` to the execFileAsync call so cmd.exe is used on Windows (resolves .cmd) while POSIX keeps shell: false (no injection risk, args are already an array).

  Also removes the dead exec / execAsync imports left over from the shell-interpolated execAsync refactor.
1 parent ae649dd commit 6c19628

1 file changed

Lines changed: 29 additions & 25 deletions

File tree

scripts/benchmark-comparators.mjs

Lines changed: 29 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,12 @@
1414
import path from 'path';
1515
import { fileURLToPath } from 'url';
1616
import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs';
17-
import { execSync, exec } from 'child_process';
17+
import { execSync, execFile } from 'child_process';
1818
import { parseArgs } from 'util';
1919
import { promisify } from 'util';
2020
import { withManagedStdioClientSession } from './lib/managed-mcp-session.mjs';
2121

22-
const execAsync = promisify(exec);
22+
const execFileAsync = promisify(execFile);
2323

2424
const __dirname = path.dirname(fileURLToPath(import.meta.url));
2525
const projectRoot = path.join(__dirname, '..');
@@ -96,15 +96,14 @@ function estimateTokens(bytes) {
9696
* - searchArgs(task): map frozen task to tool arguments
9797
* - extractPayload(result): extract string payload from MCP tool response
9898
*/
99+
const pythonCmd = process.platform === 'win32' ? 'python' : 'python3';
100+
99101
const COMPARATOR_ADAPTERS = [
100102
{
101103
name: 'codebase-memory-mcp',
102104
checkInstalled() {
103105
try {
104-
// Installed via curl installer to ~/.local/bin or similar; also available via npx
105-
execSync('which codebase-memory-mcp 2>/dev/null || npx --yes codebase-memory-mcp --version 2>/dev/null', {
106-
stdio: 'pipe'
107-
});
106+
execSync('npx --yes codebase-memory-mcp --version', { stdio: 'pipe', timeout: 30000 });
108107
return true;
109108
} catch {
110109
return false;
@@ -124,7 +123,7 @@ const COMPARATOR_ADAPTERS = [
124123
serverCommand: 'npx',
125124
serverArgs: ['--yes', 'codebase-memory-mcp'],
126125
serverEnv: {},
127-
initTimeout: 5000,
126+
initTimeout: 10000,
128127
indexTool: null, // auto-indexes on first query
129128
searchTool: 'search_code',
130129
searchArgs(task) {
@@ -141,23 +140,23 @@ const COMPARATOR_ADAPTERS = [
141140
name: 'jCodeMunch',
142141
checkInstalled() {
143142
try {
144-
execSync('python3 -c "import jcodemunch" 2>/dev/null', { stdio: 'pipe' });
143+
execSync(`${pythonCmd} -c "import jcodemunch"`, { stdio: 'pipe' });
145144
return true;
146145
} catch {
147146
return false;
148147
}
149148
},
150149
async install() {
151150
try {
152-
execSync('pip install jcodemunch-mcp', { stdio: 'pipe', timeout: 120000 });
151+
execSync(`${pythonCmd} -m pip install jcodemunch-mcp`, { stdio: 'pipe', timeout: 120000 });
153152
} catch (err) {
154153
throw new Error(`jCodeMunch install failed: ${err.message}`);
155154
}
156155
},
157-
serverCommand: 'python3',
156+
serverCommand: pythonCmd,
158157
serverArgs: ['-m', 'jcodemunch.server'],
159158
serverEnv: {},
160-
initTimeout: 8000,
159+
initTimeout: 15000,
161160
indexTool: 'index_folder',
162161
indexArgs(rootPath) {
163162
return { path: path.resolve(rootPath) };
@@ -182,7 +181,7 @@ const COMPARATOR_ADAPTERS = [
182181
name: 'GrepAI',
183182
checkInstalled() {
184183
try {
185-
execSync('which grepai 2>/dev/null', { stdio: 'pipe' });
184+
execSync('grepai --version', { stdio: 'pipe' });
186185
return true;
187186
} catch {
188187
return false;
@@ -191,7 +190,7 @@ const COMPARATOR_ADAPTERS = [
191190
async install() {
192191
// GrepAI requires a Go binary + Ollama embedding provider. Likely setup_failed without Ollama.
193192
try {
194-
execSync('which grepai', { stdio: 'pipe' });
193+
execSync('grepai --version', { stdio: 'pipe' });
195194
} catch {
196195
throw new Error(
197196
'GrepAI requires Go binary installation (Homebrew: brew install yoanbernabeu/tap/grepai) ' +
@@ -220,23 +219,23 @@ const COMPARATOR_ADAPTERS = [
220219
name: 'CodeGraphContext',
221220
checkInstalled() {
222221
try {
223-
execSync('python3 -c "import codegraphcontext" 2>/dev/null', { stdio: 'pipe' });
222+
execSync(`${pythonCmd} -c "import codegraphcontext"`, { stdio: 'pipe' });
224223
return true;
225224
} catch {
226225
return false;
227226
}
228227
},
229228
async install() {
230229
try {
231-
execSync('pip install codegraphcontext', { stdio: 'pipe', timeout: 120000 });
230+
execSync(`${pythonCmd} -m pip install codegraphcontext`, { stdio: 'pipe', timeout: 120000 });
232231
} catch (err) {
233232
throw new Error(
234233
`CodeGraphContext install failed: ${err.message}. ` +
235234
'Requires Python 3.9+ and either Neo4j or FalkorDB Lite.'
236235
);
237236
}
238237
},
239-
serverCommand: 'python3',
238+
serverCommand: pythonCmd,
240239
serverArgs: ['-m', 'codegraphcontext.server'],
241240
serverEnv: {},
242241
initTimeout: 15000,
@@ -261,16 +260,15 @@ const COMPARATOR_ADAPTERS = [
261260
name: 'raw Claude Code',
262261
checkInstalled() {
263262
try {
264-
execSync('claude --version 2>/dev/null', { stdio: 'pipe' });
263+
execSync('claude --version', { stdio: 'pipe' });
265264
return true;
266265
} catch {
267266
return false;
268267
}
269268
},
270269
async install() {
271270
throw new Error(
272-
'raw Claude Code baseline requires the Claude Code CLI (claude) to be installed and authenticated. ' +
273-
'This is the manual-log-capture baseline — record as pending_evidence if claude CLI is unavailable.'
271+
'raw Claude Code baseline requires the claude CLI. Install: npm install -g @anthropic-ai/claude-code'
274272
);
275273
},
276274
// raw Claude Code is not an MCP server; handled separately via claude -p
@@ -411,11 +409,17 @@ async function runRawClaudeCode(rootPath, tasks) {
411409

412410
try {
413411
const prompt = `You are exploring a codebase at ${path.resolve(rootPath)}. Answer this question using only grep, glob, and read file operations: ${task.prompt}`;
414-
const { stdout } = await execAsync(
415-
`claude -p "${prompt.replace(/"/g, '\\"')}" --allowedTools "Read,Grep,Glob"`,
416-
{ timeout: 60000, cwd: path.resolve(rootPath) }
412+
const { stdout } = await execFileAsync(
413+
'claude',
414+
['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'],
415+
{ timeout: 120000, cwd: path.resolve(rootPath), shell: process.platform === 'win32' }
417416
);
418-
payload = stdout;
417+
try {
418+
const parsed = JSON.parse(stdout);
419+
payload = parsed.result ?? stdout;
420+
} catch {
421+
payload = stdout;
422+
}
419423
} catch (err) {
420424
if (err.code === 'ENOENT' || err.message?.includes('command not found')) {
421425
throw new Error('claude CLI not found');
@@ -510,8 +514,8 @@ async function runComparator(adapter, repoPaths, allFixtures) {
510514
} catch (err) {
511515
if (err.message.includes('claude CLI not found')) {
512516
return {
513-
status: 'pending_evidence',
514-
reason: 'claude CLI not available. Run manually with: claude -p "<task>" --allowedTools "Read,Grep,Glob"'
517+
status: 'setup_failed',
518+
reason: 'claude CLI not found — required for baseline. Install: npm install -g @anthropic-ai/claude-code'
515519
};
516520
}
517521
return { status: 'setup_failed', reason: err.message };

0 commit comments

Comments
 (0)