Skip to content

Commit 2014151

Browse files
authored
fix(embed): resolve source files from DB root, not cwd (#992)
* fix(embed): resolve source files from DB root, not cwd

  When `codegraph embed --db <abs-path>` is invoked without a positional directory, the positional `rootDir` defaults to the current working directory. The embed loop then joined every relative node file path against cwd, causing every file read to fail and the command to exit 0 with a misleading "Stored 0 embeddings".

  Fix:
  - Persist the repo root (`root_dir`) in the `build_meta` table during build (both WASM/JS finalize and the Rust native orchestrator).
  - In `buildEmbeddings`, resolve relative file paths against `build_meta.root_dir` first, falling back to the DB's parent directory (`<root>/.codegraph/graph.db` → `<root>`) for DBs built before this change, then finally the caller-provided rootDir.
  - When every file read fails, throw `DbError` with a clear message so the CLI exits non-zero instead of printing "Stored 0 embeddings" and returning success.

  Closes #983

  Impact: 3 functions changed, 5 affected

* fix(embed): restore rootDir fallback when DB is outside .codegraph layout (#992)

  The prior 'metaRoot || dbParent || rootDir' chain was unreachable past dbParent because path.dirname always returns a non-empty string. This silently ignored an explicit positional <dir> argument for legacy DBs at non-standard locations. Only use dbParent when the DB actually lives at the conventional <root>/.codegraph/graph.db layout; otherwise fall through to the caller-provided rootDir.

  Impact: 1 function changed, 1 affected

* fix(embed): canonicalize rootDir in JS to match Rust finalize_build (#992)

  For native full builds, Rust's finalize_build writes root_dir via std::fs::canonicalize (symlink-resolving) but the JS persistBuildMetadata then overwrites it with path.resolve(), which does not resolve symlinks. On systems where the project root is behind a symlink the two values diverge and the JS write wins, reintroducing a non-canonical path. Use fs.realpathSync to match the Rust behaviour, with a safe fallback to path.resolve() if realpath throws.

  Impact: 1 function changed, 3 affected

* test(embed): wrap temp-dir tests in try/finally to avoid leaks (#992)

  The ghostRepo and legacyRepo temp dirs were cleaned up inline in the test body, so an unexpected throw before fs.rmSync would leak them. Wrap each test in try/finally so cleanup always runs.
1 parent 6f2cfeb commit 2014151

4 files changed

Lines changed: 177 additions & 2 deletions

File tree

crates/codegraph-core/src/build_pipeline.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,13 @@ fn finalize_build(conn: &Connection, root_dir: &str) -> (i64, i64) {
703703
let _ = stmt.execute(["node_count", &node_count.to_string()]);
704704
let _ = stmt.execute(["edge_count", &edge_count.to_string()]);
705705
let _ = stmt.execute(["last_build", &now_ms().to_string()]);
706+
// Persist repo root so downstream commands (e.g. `codegraph embed`)
707+
// can resolve relative file paths regardless of invoking cwd.
708+
let root_canon = std::fs::canonicalize(root_dir)
709+
.ok()
710+
.and_then(|p| p.to_str().map(|s| s.to_string()))
711+
.unwrap_or_else(|| root_dir.to_string());
712+
let _ = stmt.execute(["root_dir", &root_canon]);
706713
}
707714

708715
// Write journal header

src/domain/graph/builder/stages/finalize.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*
44
* WASM cleanup, stats logging, drift detection, build metadata, registry, journal.
55
*/
6+
import fs from 'node:fs';
67
import { tmpdir } from 'node:os';
78
import path from 'node:path';
89
import { performance } from 'node:perf_hooks';
@@ -88,6 +89,19 @@ function persistBuildMetadata(
8889
// subsequent build to be a full rebuild.
8990
const codeVersionToWrite =
9091
ctx.engineName === 'native' && ctx.engineVersion ? ctx.engineVersion : CODEGRAPH_VERSION;
92+
// Persist the repo root so downstream commands (e.g. `codegraph embed`)
93+
// can resolve relative file paths regardless of the invoking cwd.
94+
// Use realpathSync (symlink-resolving) to match the Rust engine's
95+
// std::fs::canonicalize — otherwise the JS write here would overwrite the
96+
// canonical path Rust wrote for native full builds and could re-introduce
97+
// a non-canonical path when the project root is behind a symlink.
98+
const resolvedRootDir = path.resolve(ctx.rootDir);
99+
let rootDirToWrite = resolvedRootDir;
100+
try {
101+
rootDirToWrite = fs.realpathSync(resolvedRootDir);
102+
} catch {
103+
/* realpath can fail (e.g. path no longer exists); fall back to resolve() */
104+
}
91105
try {
92106
if (useNativeDb) {
93107
ctx.nativeDb!.setBuildMeta(
@@ -99,6 +113,7 @@ function persistBuildMetadata(
99113
built_at: buildNow.toISOString(),
100114
node_count: String(nodeCount),
101115
edge_count: String(actualEdgeCount),
116+
root_dir: rootDirToWrite,
102117
}).map(([key, value]) => ({ key, value: String(value) })),
103118
);
104119
} else {
@@ -110,6 +125,7 @@ function persistBuildMetadata(
110125
built_at: buildNow.toISOString(),
111126
node_count: nodeCount,
112127
edge_count: actualEdgeCount,
128+
root_dir: rootDirToWrite,
113129
});
114130
}
115131
} catch (err) {

src/domain/search/generator.ts

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import fs from 'node:fs';
22
import path from 'node:path';
3-
import { closeDb, findDbPath, openDb } from '../../db/index.js';
3+
import { closeDb, findDbPath, getBuildMeta, openDb } from '../../db/index.js';
44
import { warn } from '../../infrastructure/logger.js';
55
import { DbError } from '../../shared/errors.js';
66
import type { BetterSqlite3Database, NodeRow } from '../../types.js';
@@ -73,6 +73,21 @@ export async function buildEmbeddings(
7373
const db = openDb(dbPath) as BetterSqlite3Database;
7474
initEmbeddingsSchema(db);
7575

76+
// Prefer the repo root recorded at build time — embed may be invoked from a
77+
// different cwd (e.g. `codegraph embed --db /abs/path/graph.db`) and the
78+
// positional rootDir will be wrong in that case. For legacy DBs without
79+
// root_dir metadata, fall back to `<dbParent>` only when the DB lives at
80+
// the conventional `<root>/.codegraph/graph.db` layout — otherwise trust
81+
// the caller-provided rootDir (which may be an explicit positional arg).
82+
// `path.dirname(...)` is always non-empty (`'.'` at minimum), so the
83+
// conventional-layout check is required to keep the rootDir path reachable.
84+
const metaRoot = getBuildMeta(db, 'root_dir');
85+
const resolvedDbPath = path.resolve(dbPath);
86+
const dbDirName = path.basename(path.dirname(resolvedDbPath));
87+
const dbParent =
88+
dbDirName === '.codegraph' ? path.dirname(path.dirname(resolvedDbPath)) : undefined;
89+
const resolvedRoot = metaRoot || dbParent || rootDir;
90+
7691
db.exec('DELETE FROM embeddings');
7792
db.exec('DELETE FROM embedding_meta');
7893
db.exec('DELETE FROM fts_index');
@@ -98,13 +113,17 @@ export async function buildEmbeddings(
98113
const config = getModelConfig(modelKey);
99114
const contextWindow = config.contextWindow;
100115
let overflowCount = 0;
116+
let filesRead = 0;
117+
let filesSkipped = 0;
101118

102119
for (const [file, fileNodes] of byFile) {
103-
const fullPath = path.isAbsolute(file) ? file : path.join(rootDir, file);
120+
const fullPath = path.isAbsolute(file) ? file : path.join(resolvedRoot, file);
104121
let lines: string[];
105122
try {
106123
lines = fs.readFileSync(fullPath, 'utf-8').split('\n');
124+
filesRead++;
107125
} catch (err: unknown) {
126+
filesSkipped++;
108127
warn(`Cannot read ${file} for embeddings: ${(err as Error).message}`);
109128
continue;
110129
}
@@ -136,6 +155,19 @@ export async function buildEmbeddings(
136155
);
137156
}
138157

158+
// If there were symbols to embed but every file failed to read, the resolved
159+
// root (build_meta root_dir, the DB's parent, or the caller's rootDir) does not
160+
// hold the sources. Surface this instead of a silent "Stored 0 embeddings".
161+
if (byFile.size > 0 && filesRead === 0) {
162+
closeDb(db);
163+
throw new DbError(
164+
`embed: could not read any of the ${filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` +
165+
`Tried resolving against: ${resolvedRoot}\n` +
166+
'Pass a positional <dir> argument pointing at the original repo root, or re-run "codegraph build" from that directory.',
167+
{ file: dbPath },
168+
);
169+
}
170+
139171
console.log(`Embedding ${texts.length} symbols...`);
140172
const { vectors, dim } = await embed(texts, modelKey);
141173

tests/search/embedding-strategy.test.ts

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,126 @@ describe('absolute file paths in DB (#752)', () => {
335335
});
336336
});
337337

338+
describe('embed resolves source files from DB root, not cwd (#983)', () => {
339+
let repoDir: string, otherDir: string, repoDbPath: string;
340+
let originalCwd: string;
341+
342+
beforeAll(() => {
343+
// Repo that was built (files live here)
344+
repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-repo-'));
345+
// Unrelated directory we'll cd into when running embed
346+
otherDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-other-'));
347+
348+
fs.writeFileSync(
349+
path.join(repoDir, 'a.js'),
350+
'export function alpha() { return 1; }\nexport function beta() { return alpha(); }\n',
351+
);
352+
353+
const dbDir = path.join(repoDir, '.codegraph');
354+
fs.mkdirSync(dbDir, { recursive: true });
355+
repoDbPath = path.join(dbDir, 'graph.db');
356+
357+
const db = new Database(repoDbPath);
358+
db.pragma('journal_mode = WAL');
359+
initSchema(db);
360+
// DB stores *relative* file paths (typical of WASM-engine builds)
361+
insertNode(db, 'alpha', 'function', 'a.js', 1, 1);
362+
insertNode(db, 'beta', 'function', 'a.js', 2, 2);
363+
// Persist the repo root as the build pipeline would
364+
db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run(
365+
'root_dir',
366+
path.resolve(repoDir),
367+
);
368+
db.close();
369+
370+
originalCwd = process.cwd();
371+
});
372+
373+
afterAll(() => {
374+
try {
375+
process.chdir(originalCwd);
376+
} catch {
377+
/* ignore */
378+
}
379+
if (repoDir) fs.rmSync(repoDir, { recursive: true, force: true });
380+
if (otherDir) fs.rmSync(otherDir, { recursive: true, force: true });
381+
});
382+
383+
test('uses root_dir metadata when embed is invoked from unrelated cwd', async () => {
384+
EMBEDDED_TEXTS.length = 0;
385+
process.chdir(otherDir);
386+
387+
// Simulate the CLI: positional dir defaults to cwd (here: otherDir), DB path is absolute
388+
await buildEmbeddings(process.cwd(), 'minilm', repoDbPath);
389+
390+
expect(EMBEDDED_TEXTS.length).toBe(2);
391+
392+
const db = new Database(repoDbPath, { readonly: true });
393+
const count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c;
394+
db.close();
395+
expect(count).toBe(2);
396+
});
397+
398+
test('falls back to <dbPath>/../.. when root_dir metadata is missing', async () => {
399+
// Build a fresh DB without root_dir metadata
400+
const legacyRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-legacy-'));
401+
try {
402+
fs.writeFileSync(path.join(legacyRepo, 'b.js'), 'export function gamma() { return 42; }\n');
403+
const legacyDbDir = path.join(legacyRepo, '.codegraph');
404+
fs.mkdirSync(legacyDbDir, { recursive: true });
405+
const legacyDb = path.join(legacyDbDir, 'graph.db');
406+
407+
const db = new Database(legacyDb);
408+
db.pragma('journal_mode = WAL');
409+
initSchema(db);
410+
insertNode(db, 'gamma', 'function', 'b.js', 1, 1);
411+
// Deliberately NOT writing root_dir — simulates DB built before #983 fix
412+
db.close();
413+
414+
EMBEDDED_TEXTS.length = 0;
415+
process.chdir(otherDir);
416+
await buildEmbeddings(process.cwd(), 'minilm', legacyDb);
417+
418+
expect(EMBEDDED_TEXTS.length).toBe(1);
419+
} finally {
420+
fs.rmSync(legacyRepo, { recursive: true, force: true });
421+
}
422+
});
423+
424+
test('exits non-zero (throws) when no source files can be read', async () => {
425+
// Build a DB pointing at files that no longer exist
426+
const ghostRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-ghost-'));
427+
try {
428+
const ghostDbDir = path.join(ghostRepo, '.codegraph');
429+
fs.mkdirSync(ghostDbDir, { recursive: true });
430+
const ghostDb = path.join(ghostDbDir, 'graph.db');
431+
432+
const db = new Database(ghostDb);
433+
db.pragma('journal_mode = WAL');
434+
initSchema(db);
435+
insertNode(db, 'missing', 'function', 'does-not-exist.js', 1, 1);
436+
db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run(
437+
'root_dir',
438+
path.resolve(ghostRepo),
439+
);
440+
db.close();
441+
442+
EMBEDDED_TEXTS.length = 0;
443+
await expect(buildEmbeddings(ghostRepo, 'minilm', ghostDb)).rejects.toThrow(
444+
/could not read any of the .* source files/,
445+
);
446+
447+
// No embeddings remain: buildEmbeddings runs `DELETE FROM embeddings` before reading any file, and nothing is stored after the throw
448+
const readDb = new Database(ghostDb, { readonly: true });
449+
const count = readDb.prepare('SELECT COUNT(*) as c FROM embeddings').get().c;
450+
readDb.close();
451+
expect(count).toBe(0);
452+
} finally {
453+
fs.rmSync(ghostRepo, { recursive: true, force: true });
454+
}
455+
});
456+
});
457+
338458
describe('context window overflow detection', () => {
339459
let bigDir: string, bigDbPath: string;
340460

0 commit comments

Comments
 (0)