Skip to content

Commit f6f5482

Browse files
authored
fix(native): backfill silently-dropped files via WASM for engine parity (#970)
* fix(native): backfill silently-dropped files via WASM for engine parity The native engine's `parse_files_parallel` (Rust) uses `filter_map` with the `?` operator, which silently drops any file whose read/parse/extract step fails — including files in languages where the installed native addon lacks an extractor (e.g. HCL/Scala/Swift on v3.9.2). WASM handles these languages correctly, so benchmarks showed 668 native vs 728 WASM file nodes on the same tree (#967). Fix the parity gap in two places: - `parseFilesAuto`: track which files the native parser actually returned; re-parse the rest with WASM so the JS pipeline gets complete coverage. - `tryNativeOrchestrator`: after the native Rust orchestrator writes to the DB, diff `collectFiles()` against the `kind='file'` rows and backfill any missing files via a WASM pass + `batchInsertNodes`. Both paths warn explicitly when a drop happens so the underlying Rust regression stays visible. This complements the Rust-side fix (adding HCL/Scala/Swift extractors in newer binaries) by protecting users who still have older native addons installed. * fix: address review feedback on native backfill (#970) - Scope backfill/warn to installed WASM grammars. Extract getInstalledWasmExtensions() helper so the parity check in parseFilesAuto and the post-orchestrator backfill both ignore languages neither engine can parse (keeps warn count meaningful). - Populate qualified_name and scope on backfilled symbol nodes so cross-file reference resolution and go-to-definition queries can find them (matches insertDefinitionsAndExports). - Only run the post-orchestrator filesystem scan on full builds. Incremental builds parse through parseFilesAuto, which already has its own per-file backfill, so the extra scan was wasted work. - Fix import ordering flagged by biome. Impact: 4 functions changed, 14 affected * fix(native): backfill exported symbols in WASM-dropped-files path (#970) The engine-parity backfill in `backfillNativeDroppedFiles` was only inserting `symbols.definitions` and skipping `symbols.exports`. Queries filtering on `exported = 1` (e.g. dead-export detection, reverse-dep walks, "exports of file") would miss symbols from backfilled files — HCL/Scala/Swift on stale native addons. Mirror the reference flow in `src/domain/graph/builder/stages/insert-nodes.ts:insertDefinitionsAndExports`: - Iterate `symbols.exports ?? []`, push an `INSERT OR IGNORE` row (no-op if a definition row for the same name/kind/line already exists), and collect the (name, kind, file, line) key for a second pass. - After `batchInsertNodes`, run chunked `UPDATE nodes SET exported = 1 WHERE …` statements (chunk size 500 with a prepared-statement cache) so backfilled exports are discoverable. Also fix a smaller parity gap in the file row: `qualified_name` was `relPath`; the reference flow uses `NULL`. Matches the canonical insert so `kind='file'` rows are shape-identical across both paths. Addresses Greptile P1 (comment 3107870224) on PR #970. Impact: 1 functions changed, 4 affected
1 parent 9be31e1 commit f6f5482

2 files changed

Lines changed: 158 additions & 2 deletions

File tree

src/domain/graph/builder/pipeline.ts

Lines changed: 117 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import { detectWorkspaces, loadConfig } from '../../../infrastructure/config.js'
2121
import { debug, info, warn } from '../../../infrastructure/logger.js';
2222
import { loadNative } from '../../../infrastructure/native.js';
2323
import { semverCompare } from '../../../infrastructure/update-check.js';
24+
import { normalizePath } from '../../../shared/constants.js';
2425
import { toErrorMessage } from '../../../shared/errors.js';
2526
import { CODEGRAPH_VERSION } from '../../../shared/version.js';
2627
import type {
@@ -29,11 +30,12 @@ import type {
2930
BuildResult,
3031
Definition,
3132
ExtractorOutput,
33+
SqliteStatement,
3234
} from '../../../types.js';
33-
import { getActiveEngine } from '../../parser.js';
35+
import { getActiveEngine, getInstalledWasmExtensions, parseFilesAuto } from '../../parser.js';
3436
import { setWorkspaces } from '../resolve.js';
3537
import { PipelineContext } from './context.js';
36-
import { loadPathAliases } from './helpers.js';
38+
import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
3739
import { NativeDbProxy } from './native-db-proxy.js';
3840
import { buildEdges } from './stages/build-edges.js';
3941
import { buildStructure } from './stages/build-structure.js';
@@ -696,10 +698,123 @@ async function tryNativeOrchestrator(
696698
}
697699
}
698700

701+
// Engine parity: the native orchestrator silently drops files whose
702+
// Rust extractor/grammar is missing or fails (e.g. HCL, Scala, Swift on
703+
// stale native binaries). WASM handles those — backfill via WASM so both
704+
// engines process the same file set (#967).
705+
//
706+
// Only runs on full builds: incremental builds only touch changed files,
707+
// which are parsed through parseFilesAuto (which has its own per-file
708+
// backfill), so a full filesystem scan here would be wasted work.
709+
if (result.isFullBuild) {
710+
await backfillNativeDroppedFiles(ctx);
711+
}
712+
699713
closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
700714
return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
701715
}
702716

717+
/**
718+
* Backfill files that the native orchestrator silently dropped during parse.
719+
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
720+
*/
721+
async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
722+
// Needs a real better-sqlite3 connection for INSERT.
723+
if (ctx.nativeFirstProxy) {
724+
closeNativeDb(ctx, 'pre-parity-backfill');
725+
ctx.db = openDb(ctx.dbPath);
726+
ctx.nativeFirstProxy = false;
727+
}
728+
729+
const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set<string>());
730+
const expected = new Set(
731+
collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))),
732+
);
733+
734+
const existingRows = ctx.db
735+
.prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'")
736+
.all() as Array<{ file: string }>;
737+
const existing = new Set(existingRows.map((r) => r.file));
738+
739+
// Restrict backfill to files with an installed WASM grammar. Extensions in
740+
// LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on
741+
// minimal installs) can't be parsed by either engine, so they're not a
742+
// native regression — excluding them keeps the warn count meaningful.
743+
const installedExts = getInstalledWasmExtensions();
744+
const missingAbs: string[] = [];
745+
for (const rel of expected) {
746+
if (existing.has(rel)) continue;
747+
const ext = path.extname(rel).toLowerCase();
748+
if (!installedExts.has(ext)) continue;
749+
missingAbs.push(path.join(ctx.rootDir, rel));
750+
}
751+
if (missingAbs.length === 0) return;
752+
753+
warn(
754+
`Native orchestrator dropped ${missingAbs.length} file(s); backfilling via WASM for engine parity`,
755+
);
756+
const wasmResults = await parseFilesAuto(missingAbs, ctx.rootDir, { engine: 'wasm' });
757+
758+
const rows: unknown[][] = [];
759+
const exportKeys: unknown[][] = [];
760+
for (const [relPath, symbols] of wasmResults) {
761+
// File row — mirrors insertDefinitionsAndExports: qualified_name is null.
762+
rows.push([relPath, 'file', relPath, 0, null, null, null, null, null]);
763+
for (const def of symbols.definitions ?? []) {
764+
// Populate qualified_name/scope the same way the JS fallback does so
765+
// downstream queries (cross-file references, "go to definition") find
766+
// these symbols.
767+
const dotIdx = def.name.lastIndexOf('.');
768+
const scope = dotIdx !== -1 ? def.name.slice(0, dotIdx) : null;
769+
rows.push([
770+
def.name,
771+
def.kind,
772+
relPath,
773+
def.line,
774+
def.endLine ?? null,
775+
null,
776+
def.name,
777+
scope,
778+
def.visibility ?? null,
779+
]);
780+
}
781+
// Exports: insert the row (INSERT OR IGNORE — a matching definition row
782+
// is a no-op) and queue a key for the second-pass exported=1 update, so
783+
// queries filtering on exported=1 find backfilled symbols (#970).
784+
for (const exp of symbols.exports ?? []) {
785+
rows.push([exp.name, exp.kind, relPath, exp.line, null, null, exp.name, null, null]);
786+
exportKeys.push([exp.name, exp.kind, relPath, exp.line]);
787+
}
788+
}
789+
const db = ctx.db as unknown as BetterSqlite3Database;
790+
batchInsertNodes(db, rows);
791+
792+
// Mark exported symbols in batches — mirrors insertDefinitionsAndExports.
793+
if (exportKeys.length > 0) {
794+
const EXPORT_CHUNK = 500;
795+
const exportStmtCache = new Map<number, SqliteStatement>();
796+
for (let i = 0; i < exportKeys.length; i += EXPORT_CHUNK) {
797+
const end = Math.min(i + EXPORT_CHUNK, exportKeys.length);
798+
const chunkSize = end - i;
799+
let updateStmt = exportStmtCache.get(chunkSize);
800+
if (!updateStmt) {
801+
const conditions = Array.from(
802+
{ length: chunkSize },
803+
() => '(name = ? AND kind = ? AND file = ? AND line = ?)',
804+
).join(' OR ');
805+
updateStmt = db.prepare(`UPDATE nodes SET exported = 1 WHERE ${conditions}`);
806+
exportStmtCache.set(chunkSize, updateStmt);
807+
}
808+
const vals: unknown[] = [];
809+
for (let j = i; j < end; j++) {
810+
const k = exportKeys[j] as unknown[];
811+
vals.push(k[0], k[1], k[2], k[3]);
812+
}
813+
updateStmt.run(...vals);
814+
}
815+
}
816+
}
817+
703818
// ── Pipeline stages execution ───────────────────────────────────────────
704819

705820
async function runPipelineStages(ctx: PipelineContext): Promise<void> {

src/domain/parser.ts

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,27 @@ export function isWasmAvailable(): boolean {
338338
);
339339
}
340340

341+
/**
342+
* Return the set of lowercase file extensions whose WASM grammar is actually
343+
* installed on disk. Used to scope engine-parity backfill to files that WASM
344+
* can recover — languages without an installed grammar are skipped by both
345+
* engines, so they don't represent a native-engine drop.
346+
*
347+
* Cached on first call; the grammars directory is shipped immutable.
348+
*/
349+
let _installedWasmExts: Set<string> | null = null;
350+
export function getInstalledWasmExtensions(): Set<string> {
351+
if (_installedWasmExts) return _installedWasmExts;
352+
const exts = new Set<string>();
353+
for (const entry of LANGUAGE_REGISTRY) {
354+
if (fs.existsSync(grammarPath(entry.grammarFile))) {
355+
for (const ext of entry.extensions) exts.add(ext.toLowerCase());
356+
}
357+
}
358+
_installedWasmExts = exts;
359+
return exts;
360+
}
361+
341362
// ── Unified API ──────────────────────────────────────────────────────────────
342363

343364
function resolveEngine(opts: ParseEngineOpts = {}): ResolvedEngine {
@@ -884,8 +905,10 @@ export async function parseFilesAuto(
884905
? native.parseFilesFull(filePaths, rootDir)
885906
: native.parseFiles(filePaths, rootDir, true, true);
886907
const needsTypeMap: { filePath: string; relPath: string }[] = [];
908+
const nativeParsed = new Set<string>();
887909
for (const r of nativeResults) {
888910
if (!r) continue;
911+
nativeParsed.add(r.file);
889912
const patched = patchNativeResult(r);
890913
const relPath = path.relative(rootDir, r.file).split(path.sep).join('/');
891914
result.set(relPath, patched);
@@ -901,6 +924,24 @@ export async function parseFilesAuto(
901924
if (needsTypeMap.length > 0) {
902925
await backfillTypeMapBatch(needsTypeMap, result);
903926
}
927+
928+
// Engine parity: native may silently drop files whose extensions are in
929+
// SUPPORTED_EXTENSIONS (because a WASM grammar exists) but whose Rust
930+
// extractor/grammar is missing or fails. WASM handles these — fall back so
931+
// both engines process the same file set (#967). Restrict to installed WASM
932+
// grammars so we don't warn about files that neither engine can parse.
933+
const installedExts = getInstalledWasmExtensions();
934+
const dropped = filePaths.filter(
935+
(f) => !nativeParsed.has(f) && installedExts.has(path.extname(f).toLowerCase()),
936+
);
937+
if (dropped.length > 0) {
938+
warn(`Native engine dropped ${dropped.length} file(s); falling back to WASM for parity`);
939+
const wasmResults = await parseFilesWasm(dropped, rootDir);
940+
for (const [relPath, symbols] of wasmResults) {
941+
result.set(relPath, symbols);
942+
}
943+
}
944+
904945
return result;
905946
}
906947

0 commit comments

Comments
 (0)